| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) | |||
| project(OpenBLAS) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 2) | |||
| set(OpenBLAS_PATCH_VERSION 18) | |||
| set(OpenBLAS_PATCH_VERSION 19) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| enable_language(ASM) | |||
| @@ -45,8 +45,8 @@ endif() | |||
| message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") | |||
| include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${CMAKE_SOURCE_DIR}/cmake/system.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
| @@ -123,9 +123,9 @@ endforeach () | |||
| # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. | |||
| # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. | |||
| if (NOT NOFORTRAN AND NOT NO_LAPACK) | |||
| include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake") | |||
| if (NOT NO_LAPACKE) | |||
| include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake") | |||
| endif () | |||
| endif () | |||
| @@ -137,7 +137,7 @@ endif() | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| @@ -150,3 +150,14 @@ In chronological order: | |||
| * theoractice <https://github.com/theoractice/> | |||
| * [2016-03-20] Fix compiler error in VisualStudio with CMake | |||
| * [2016-03-22] Fix access violation on Windows while static linking | |||
| * Paul Mustière <https://github.com/buffer51/> | |||
| * [2016-02-04] Fix Android build on ARMV7 | |||
| * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 | |||
| * Shivraj Patil <https://github.com/sva-img/> | |||
| * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA | |||
| * Kaustubh Raste <https://github.com/ksraste/> | |||
| * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | |||
| * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA | |||
| @@ -1,4 +1,22 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.19 | |||
| 1-Sep-2016 | |||
| common: | |||
| * Improved cross compiling. | |||
| * Fix the bug on musl libc. | |||
| POWER: | |||
| * Optimize BLAS on Power8 | |||
| * Fixed Julia+OpenBLAS bugs on Power8 | |||
| MIPS: | |||
| * Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste) | |||
| ARM: | |||
| * Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) | |||
| ==================================================================== | |||
| Version 0.2.18 | |||
| 12-Apr-2016 | |||
| @@ -108,8 +108,6 @@ endif | |||
| tests : | |||
| ifndef NOFORTRAN | |||
| ifndef TARGET | |||
| ifndef CROSS | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| @@ -119,8 +117,6 @@ ifndef NO_CBLAS | |||
| $(MAKE) -C ctest all | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| libs : | |||
| ifeq ($(CORE), UNKOWN) | |||
| @@ -20,75 +20,75 @@ lib.grd : | |||
| $(error OpenBLAS: Please run "make" firstly) | |||
| install : lib.grd | |||
| @-mkdir -p $(DESTDIR)$(PREFIX) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @-mkdir -p "$(DESTDIR)$(PREFIX)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | |||
| @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| #for inc | |||
| @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| ifndef NO_CBLAS | |||
| @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h | |||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| # Copy the DLL into the staged binary directory (honors DESTDIR, quoted like the WINNT branch). | |||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @@ -96,34 +96,34 @@ endif | |||
| endif | |||
| #Generating OpenBLASConfig.cmake | |||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| ifndef NO_SHARED | |||
| # Logical OR via ifeq + $(filter ...): true when OSNAME is any one of the listed names | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| else | |||
| #only static | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| #Generating OpenBLASConfigVersion.cmake | |||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo Install OK! | |||
| @@ -0,0 +1,3 @@ | |||
| ifdef BINARY64 | |||
| else | |||
| endif | |||
| @@ -1,4 +1,26 @@ | |||
| # CCOMMON_OPT += -DALLOC_SHM | |||
| # Decide whether OpenMP is used: USE_OPENMP becomes 0 only when USE_THREAD is | |||
| # set to 0; in every other case (USE_THREAD unset or any other value) it is 1. | |||
| ifdef USE_THREAD | |||
| ifeq ($(USE_THREAD), 0) | |||
| USE_OPENMP = 0 | |||
| else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| # Flags appended to COMMON_OPT and FCOMMON_OPT when CORE is POWER8. | |||
| # The USE_OPENMP=1 variant differs only by the extra -DUSE_OPENMP and -fopenmp. | |||
| # NOTE(review): -fno-fast-math follows -Ofast, presumably to keep -Ofast's other | |||
| # optimizations while turning fast-math back off -- confirm intent. | |||
| ifeq ($(CORE), POWER8) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||
| endif | |||
| endif | |||
| FLAMEPATH = $(HOME)/flame/lib | |||
| @@ -16,6 +38,16 @@ else | |||
| endif | |||
| endif | |||
| # Optional MASS library support: either uncomment the line below or run make with `USE_MASS=1`. | |||
| #USE_MASS = 1 | |||
| ifeq ($(USE_MASS), 1) | |||
| # Path to the MASS libraries; change it if they are installed at another location. | |||
| MASSPATH = /opt/ibm/xlmass/8.1.3/lib | |||
| # Define USE_MASS for the compiled sources and add the MASS/vectorization compiler flags. | |||
| COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS | |||
| # Link against the MASS libraries located under MASSPATH. | |||
| EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 | |||
| endif | |||
| ifdef BINARY64 | |||
| @@ -17,14 +17,26 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| # Select a MIPS ISA-level flag from TARGET. TARGET_FLAGS stays unset for any other | |||
| # TARGET value; it is appended to the c_check and f_check commands in the config.h rule. | |||
| ifeq ($(TARGET), P5600) | |||
| TARGET_FLAGS = -mips32r5 | |||
| endif | |||
| # I6400 and P6600 both use the mips64r6 ISA level. | |||
| ifeq ($(TARGET), I6400) | |||
| TARGET_FLAGS = -mips64r6 | |||
| endif | |||
| ifeq ($(TARGET), P6600) | |||
| TARGET_FLAGS = -mips64r6 | |||
| endif | |||
| all: getarch_2nd | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| config.h : c_check f_check getarch | |||
| perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) | |||
| perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) | |||
| perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) | |||
| else | |||
| #When we only build CBLAS, we set NOFORTRAN=2 | |||
| echo "NOFORTRAN=2" >> $(TARGET_MAKE) | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.2.18 | |||
| VERSION = 0.2.19 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -52,6 +52,7 @@ VERSION = 0.2.18 | |||
| # USE_THREAD = 0 | |||
| # If you're going to use this library with OpenMP, please uncomment the USE_OPENMP line below. | |||
| # This flag is always set for POWER8; do not modify it for that target. | |||
| # USE_OPENMP = 1 | |||
| # You can define maximum number of threads. Basically it should be | |||
| @@ -153,10 +154,12 @@ NO_AFFINITY = 1 | |||
| # Common Optimization Flag; | |||
| # The default -O2 is enough. | |||
| # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT | |||
| # COMMON_OPT = -O2 | |||
| # gfortran option for LAPACK | |||
| # enable this flag only on 64bit Linux and if you need a thread safe lapack library | |||
| # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT | |||
| # FCOMMON_OPT = -frecursive | |||
| # Profiling flags | |||
| @@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| @@ -462,7 +462,7 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||
| NO_BINARY_MODE = 1 | |||
| endif | |||
| @@ -502,13 +502,16 @@ endif | |||
| ifdef NO_BINARY_MODE | |||
| ifeq ($(ARCH), mips64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
| ifdef BINARY64 | |||
| CCOMMON_OPT += -mabi=64 | |||
| else | |||
| CCOMMON_OPT += -mabi=n32 | |||
| endif | |||
| BINARY_DEFINED = 1 | |||
| else ifeq ($(ARCH), $(filter $(ARCH),mips)) | |||
| CCOMMON_OPT += -mabi=32 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| @@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| # Per-core flags for the MIPS P5600, I6400 and P6600 cores. Each branch appends the | |||
| # same set to CCOMMON_OPT and FCOMMON_OPT: ISA level, -mnan=2008, -mtune for the core, | |||
| # and $(MSA_FLAGS) (MSA_FLAGS is defined outside this section). | |||
| ifeq ($(CORE), P5600) | |||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), I6400) | |||
| CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), P6600) | |||
| CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(OSNAME), AIX) | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| @@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1) | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| ifdef NO_BINARY_MODE | |||
| ifeq ($(ARCH), mips64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
| ifdef BINARY64 | |||
| FCOMMON_OPT += -mabi=64 | |||
| else | |||
| FCOMMON_OPT += -mabi=n32 | |||
| endif | |||
| else ifeq ($(ARCH), $(filter $(ARCH),mips)) | |||
| FCOMMON_OPT += -mabi=32 | |||
| endif | |||
| else | |||
| ifdef BINARY64 | |||
| @@ -677,21 +697,7 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| endif | |||
| ifneq ($(ARCH), mips64) | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -m32 | |||
| else | |||
| FCOMMON_OPT += -m64 | |||
| endif | |||
| else | |||
| ifdef BINARY64 | |||
| FCOMMON_OPT += -mabi=64 | |||
| else | |||
| FCOMMON_OPT += -mabi=n32 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -mp | |||
| endif | |||
| @@ -707,7 +713,7 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -n32 | |||
| else | |||
| @@ -737,7 +743,7 @@ endif | |||
| ifeq ($(C_COMPILER), OPEN64) | |||
| ifeq ($(ARCH), mips64) | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) | |||
| ifndef BINARY64 | |||
| CCOMMON_OPT += -n32 | |||
| else | |||
| @@ -1126,6 +1132,8 @@ export HAVE_VFP | |||
| export HAVE_VFPV3 | |||
| export HAVE_VFPV4 | |||
| export HAVE_NEON | |||
| export HAVE_MSA | |||
| export MSA_FLAGS | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| @@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6 | |||
| make DEBUG=1 | |||
| ### Compile with MASS Support on Power CPU (Optional dependency) | |||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and | |||
| Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | |||
| The library can be installed as follows: | |||
| * On Ubuntu: | |||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - | |||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list | |||
| sudo apt-get update | |||
| sudo apt-get install libxlmass-devel.8.1.3 | |||
| * On RHEL/CentOS: | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key | |||
| sudo rpm --import repomd.xml.key | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo | |||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ | |||
| sudo yum install libxlmass-devel.8.1.3 | |||
| After installing the MASS library, compile OpenBLAS with USE_MASS=1. | |||
| Example: | |||
| Compiling on Power8 with MASS support - | |||
| make USE_MASS=1 TARGET=POWER8 | |||
| ### Install to the directory (optional) | |||
| Example: | |||
| @@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt | |||
| - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we have little experience with Mac OS X. | |||
| - **FreeBSD**: Supported by the community. We have not tested the library on this OS. | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| ## Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| @@ -53,26 +53,31 @@ PPC440 | |||
| PPC440FP2 | |||
| CELL | |||
| 3.MIPS64 CPU: | |||
| 3.MIPS CPU: | |||
| P5600 | |||
| 4.MIPS64 CPU: | |||
| SICORTEX | |||
| LOONGSON3A | |||
| LOONGSON3B | |||
| I6400 | |||
| P6600 | |||
| 4.IA64 CPU: | |||
| 5.IA64 CPU: | |||
| ITANIUM2 | |||
| 5.SPARC CPU: | |||
| 6.SPARC CPU: | |||
| SPARC | |||
| SPARCV7 | |||
| 6.ARM CPU: | |||
| 7.ARM CPU: | |||
| CORTEXA15 | |||
| CORTEXA9 | |||
| ARMV7 | |||
| ARMV6 | |||
| ARMV5 | |||
| 7.ARM 64-bit CPU: | |||
| 8.ARM 64-bit CPU: | |||
| ARMV8 | |||
| CORTEXA57 | |||
| @@ -1,4 +1,4 @@ | |||
| version: 0.2.18.{build} | |||
| version: 0.2.19.{build} | |||
| #environment: | |||
| @@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| smallscaling | |||
| smallscaling \ | |||
| isamax.goto idamax.goto icamax.goto izamax.goto \ | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ | |||
| sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ | |||
| sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ | |||
| spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ | |||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas | |||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ | |||
| isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ | |||
| snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas | |||
| mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ | |||
| @@ -261,7 +265,9 @@ endif | |||
| essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | |||
| cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | |||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl | |||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ | |||
| scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ | |||
| strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl | |||
| veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | |||
| scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | |||
| @@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX) | |||
| scholesky.veclib : scholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| scholesky.essl : scholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dcholesky ################################################### | |||
| dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX) | |||
| dcholesky.veclib : dcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| dcholesky.essl : dcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ccholesky ################################################### | |||
| ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX) | |||
| ccholesky.veclib : ccholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ccholesky.essl : ccholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zcholesky ################################################### | |||
| @@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX) | |||
| zcholesky.veclib : zcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zcholesky.essl : zcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sgemm #################################################### | |||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX) | |||
| strsm.veclib : strsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| strsm.essl : strsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dtrsm #################################################### | |||
| dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) | |||
| dtrsm.veclib : dtrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| dtrsm.essl : dtrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ctrsm #################################################### | |||
| ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) | |||
| ctrsm.veclib : ctrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ctrsm.essl : ctrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ztrsm #################################################### | |||
| ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) | |||
| ztrsm.veclib : ztrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ztrsm.essl : ztrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ssyrk #################################################### | |||
| ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX) | |||
| zgemm3m.veclib : zgemm3m.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| # Link rules for the i?amax and ?nrm2 benchmark drivers. | |||
| # <name>.goto links the driver object against ../$(LIBNAME); <name>.atlas links it against $(LIBATLAS). | |||
| # The leading '-' on the .atlas recipes tells make to carry on when that link step fails. | |||
| ############################################## ISAMAX ############################################## | |||
| isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| isamax.atlas : isamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## IDAMAX ############################################## | |||
| idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| idamax.atlas : idamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## ICAMAX ############################################## | |||
| icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| icamax.atlas : icamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## IZAMAX ############################################## | |||
| izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| izamax.atlas : izamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## SNRM2 ############################################## | |||
| snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| snrm2.atlas : snrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## DNRM2 ############################################## | |||
| dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dnrm2.atlas : dnrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## SCNRM2 ############################################## | |||
| scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| scnrm2.atlas : scnrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## DZNRM2 ############################################## | |||
| dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dznrm2.atlas : dznrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ################################################################################################### | |||
| slinpack.$(SUFFIX) : linpack.c | |||
| @@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c | |||
| zgemm3m.$(SUFFIX) : gemm3m.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| # Driver objects: iamax.c and nrm2.c are each compiled four times. | |||
| # -DCOMPLEX / -UCOMPLEX and -DDOUBLE / -UDOUBLE set or clear the macros the source tests to pick the routine it calls: | |||
| #   -UCOMPLEX -UDOUBLE : single real      -UCOMPLEX -DDOUBLE : double real | |||
| #   -DCOMPLEX -UDOUBLE : single complex   -DCOMPLEX -DDOUBLE : double complex | |||
| isamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| idamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| icamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| izamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| snrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dnrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| scnrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dznrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| #ifdef COMPLEX | |||
| fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); | |||
| #else | |||
| fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); | |||
| #endif | |||
| } | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MBytes\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MBytes %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||
| } | |||
| } | |||
| @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||
| } | |||
| } | |||
| @@ -0,0 +1,190 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef IAMAX | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMAX BLASFUNC(izamax) | |||
| #else | |||
| #define IAMAX BLASFUNC(icamax) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMAX BLASFUNC(idamax) | |||
| #else | |||
| #define IAMAX BLASFUNC(isamax) | |||
| #endif | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| /* Offset, in microseconds, subtracted to rebase a FILETIME value onto the Unix epoch. */ | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* | |||
|  * Minimal gettimeofday() stand-in for Windows builds. | |||
|  * tv : receives the current system time as seconds + microseconds relative to the Unix epoch; | |||
|  *      when NULL nothing is written. | |||
|  * tz : ignored. | |||
|  * Always returns 0. | |||
|  */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 usec; | |||
|   (void)tz; /* accepted only for signature compatibility */ | |||
|   if (tv == NULL) return 0; | |||
|   GetSystemTimeAsFileTime(&ft); | |||
|   /* Join the two 32-bit halves of the FILETIME into one 64-bit tick count. */ | |||
|   usec = ((unsigned __int64)ft.dwHighDateTime << 32) | ft.dwLowDateTime; | |||
|   usec /= 10; /* convert into microseconds */ | |||
|   usec -= DELTA_EPOCH_IN_MICROSECS; /* file time -> Unix epoch */ | |||
|   tv->tv_sec = (long)(usec / 1000000UL); | |||
|   tv->tv_usec = (long)(usec % 1000000UL); | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
| /* | |||
|  * Optional allocator backed by a SysV shared-memory segment requested with SHM_HUGETLB. | |||
|  * The trailing "&& 0" in the guard above keeps this whole section compiled out. | |||
|  * The requested size is bumped by HUGE_PAGESIZE and masked with ~(HUGE_PAGESIZE - 1) before shmget(). | |||
|  * On shmget()/shmat() failure a message is printed to stdout and the process exits with status 1. | |||
|  * The segment is marked for removal (IPC_RMID) right after it has been attached. | |||
|  */ | |||
| static void *huge_malloc(BLASLONG size){ | |||
|   int segment; | |||
|   void *base; | |||
|   segment = shmget(IPC_PRIVATE, | |||
|                    (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
|                    SHM_HUGETLB | IPC_CREAT |0600); | |||
|   if (segment < 0) { | |||
|     printf( "Memory allocation failed(shmget).\n"); | |||
|     exit(1); | |||
|   } | |||
|   base = shmat(segment, NULL, SHM_RND); | |||
|   if ((BLASLONG)base == -1){ | |||
|     printf( "Memory allocation failed(shmat).\n"); | |||
|     exit(1); | |||
|   } | |||
|   shmctl(segment, IPC_RMID, 0); | |||
|   return base; | |||
| } | |||
| /* From here on, malloc() in this file resolves to the allocator above. */ | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for the i?amax routine selected through the IAMAX macro. | |||
|  * Arguments  : [from [to [step]]] -- vector sizes to time (defaults 1, 200, 1). | |||
|  * Environment: OPENBLAS_LOOPS = calls averaged per size (default 1), | |||
|  *              OPENBLAS_INCX  = stride handed to the routine (default 1). | |||
|  * For each size the vector is refilled with values in [-0.5, 0.5], the call is timed with | |||
|  * gettimeofday(), and the mean wall time per call is written to stderr. | |||
|  * Returns 0; exits with status 1 when the work vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* With no repetitions the average computed below would be 0/0. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* blasint is a build-dependent integer type; convert explicitly for %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Time\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Fresh input for every repetition; filling is kept outside the timed region. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       IAMAX (&m, x, &inc_x); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||
|   } | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -0,0 +1,190 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef NRM2 | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NRM2 BLASFUNC(dznrm2) | |||
| #else | |||
| #define NRM2 BLASFUNC(scnrm2) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define NRM2 BLASFUNC(dnrm2) | |||
| #else | |||
| #define NRM2 BLASFUNC(snrm2) | |||
| #endif | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| /* Offset, in microseconds, subtracted to rebase a FILETIME value onto the Unix epoch. */ | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* | |||
|  * Minimal gettimeofday() stand-in for Windows builds. | |||
|  * tv : receives the current system time as seconds + microseconds relative to the Unix epoch; | |||
|  *      when NULL nothing is written. | |||
|  * tz : ignored. | |||
|  * Always returns 0. | |||
|  */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 usec; | |||
|   (void)tz; /* accepted only for signature compatibility */ | |||
|   if (tv == NULL) return 0; | |||
|   GetSystemTimeAsFileTime(&ft); | |||
|   /* Join the two 32-bit halves of the FILETIME into one 64-bit tick count. */ | |||
|   usec = ((unsigned __int64)ft.dwHighDateTime << 32) | ft.dwLowDateTime; | |||
|   usec /= 10; /* convert into microseconds */ | |||
|   usec -= DELTA_EPOCH_IN_MICROSECS; /* file time -> Unix epoch */ | |||
|   tv->tv_sec = (long)(usec / 1000000UL); | |||
|   tv->tv_usec = (long)(usec % 1000000UL); | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
| /* | |||
|  * Optional allocator backed by a SysV shared-memory segment requested with SHM_HUGETLB. | |||
|  * The trailing "&& 0" in the guard above keeps this whole section compiled out. | |||
|  * The requested size is bumped by HUGE_PAGESIZE and masked with ~(HUGE_PAGESIZE - 1) before shmget(). | |||
|  * On shmget()/shmat() failure a message is printed to stdout and the process exits with status 1. | |||
|  * The segment is marked for removal (IPC_RMID) right after it has been attached. | |||
|  */ | |||
| static void *huge_malloc(BLASLONG size){ | |||
|   int segment; | |||
|   void *base; | |||
|   segment = shmget(IPC_PRIVATE, | |||
|                    (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), | |||
|                    SHM_HUGETLB | IPC_CREAT |0600); | |||
|   if (segment < 0) { | |||
|     printf( "Memory allocation failed(shmget).\n"); | |||
|     exit(1); | |||
|   } | |||
|   base = shmat(segment, NULL, SHM_RND); | |||
|   if ((BLASLONG)base == -1){ | |||
|     printf( "Memory allocation failed(shmat).\n"); | |||
|     exit(1); | |||
|   } | |||
|   shmctl(segment, IPC_RMID, 0); | |||
|   return base; | |||
| } | |||
| /* From here on, malloc() in this file resolves to the allocator above. */ | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for the ?nrm2 routine selected through the NRM2 macro. | |||
|  * Arguments  : [from [to [step]]] -- vector sizes to time (defaults 1, 200, 1). | |||
|  * Environment: OPENBLAS_LOOPS = calls averaged per size (default 1), | |||
|  *              OPENBLAS_INCX  = stride handed to the routine (default 1). | |||
|  * For each size the vector is refilled with values in [-0.5, 0.5], the call is timed with | |||
|  * gettimeofday(), and the mean wall time per call is written to stderr. The norm itself is discarded. | |||
|  * Returns 0; exits with status 1 when the work vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* With no repetitions the average computed below would be 0/0. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* blasint is a build-dependent integer type; convert explicitly for %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Time\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Fresh input for every repetition; filling is kept outside the timed region. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       NRM2 (&m, x, &inc_x); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||
|   } | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -186,8 +186,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -189,9 +189,9 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| #ifdef COMPLEX | |||
| fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); | |||
| #else | |||
| fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg); | |||
| #endif | |||
| } | |||
| @@ -5,6 +5,7 @@ | |||
| #include <time.h> | |||
| #include <cblas.h> | |||
| #include <omp.h> | |||
| #include <pthread.h> | |||
| #define MIN_SIZE 5 | |||
| #define MAX_SIZE 60 | |||
| #define NB_SIZE 10 | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MBytes\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MBytes %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); | |||
| } | |||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -1,5 +1,8 @@ | |||
| #!/usr/bin/perl | |||
| use File::Basename; | |||
| use File::Temp qw(tempfile); | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| @@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/); | |||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
| $tmpf = new File::Temp( UNLINK => 1 ); | |||
| $binary = $ENV{"BINARY"}; | |||
| $makefile = shift(@ARGV); | |||
| @@ -26,14 +30,12 @@ if ($?) { | |||
| $cross_suffix = ""; | |||
| if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { | |||
| if ($1 =~ /(.*-)(.*)/) { | |||
| $cross_suffix = $1; | |||
| } | |||
| } else { | |||
| if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { | |||
| $cross_suffix = $1; | |||
| } | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /(.*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| $compiler = ""; | |||
| @@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips32 if ($data =~ /ARCH_MIPS32/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| @@ -79,7 +81,12 @@ if ($os eq "AIX") { | |||
| $defined = 1; | |||
| } | |||
| if (($architecture eq "mips32") || ($architecture eq "mips64")) { | |||
| if ($architecture eq "mips") { | |||
| $compiler_name .= " -mabi=32"; | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "mips64") { | |||
| $compiler_name .= " -mabi=n32" if ($binary eq "32"); | |||
| $compiler_name .= " -mabi=64" if ($binary eq "64"); | |||
| $defined = 1; | |||
| @@ -152,10 +159,28 @@ if ($?) { | |||
| die 1; | |||
| } | |||
| # Probe for MSA support: on mips / mips64, write a one-instruction test program to the | |||
| # temp file and try to build it with the MSA flags. $have_msa ends up 1 only when the | |||
| # compiler command exits with status 0; $msa_flags stays set for the output written later. | |||
| $have_msa = 0; | |||
| if (grep { $architecture eq $_ } ("mips", "mips64")) { | |||
|     $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
|     $code = '"addvi.b $w0, $w1, 1"'; | |||
|     print $tmpf "#include <msa.h>\n\n"; | |||
|     print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
|     # The temp file name has no .c suffix, hence the explicit "-x c". | |||
|     $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
|     my $probe = "$compiler_name $args"; | |||
|     system($probe); | |||
|     $have_msa = ($? != 0) ? 0 : 1; | |||
|     # Drop the probe's output file; the source temp file was created with UNLINK => 1. | |||
|     unlink("$tmpf.o"); | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips32 if ($data =~ /ARCH_MIPS32/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| @@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; | |||
| print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; | |||
| print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; | |||
| print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS=1\n" if $cross != 0; | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; | |||
| print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| @@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n"; | |||
| print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; | |||
| print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| if ($os eq "LINUX") { | |||
| @@ -53,7 +53,7 @@ endif() | |||
| add_custom_command( | |||
| TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
| COMMAND perl | |||
| ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
| ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
| COMMENT "Create openblas.def file" | |||
| VERBATIM) | |||
| @@ -50,20 +50,20 @@ else() | |||
| set(TARGET_CONF "config.h") | |||
| endif () | |||
| include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") | |||
| if (NOT NOFORTRAN) | |||
| include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") | |||
| endif () | |||
| # compile getarch | |||
| set(GETARCH_SRC | |||
| ${CMAKE_SOURCE_DIR}/getarch.c | |||
| ${PROJECT_SOURCE_DIR}/getarch.c | |||
| ${CPUIDEMO} | |||
| ) | |||
| if (NOT MSVC) | |||
| list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S) | |||
| list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
| endif () | |||
| if (MSVC) | |||
| @@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH_DIR}) | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| @@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | |||
| set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH2_DIR}) | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| @@ -3,7 +3,7 @@ | |||
| ## Description: Ported from OpenBLAS/Makefile.system | |||
| ## | |||
| set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib") | |||
| set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") | |||
| # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa | |||
| # http://stackoverflow.com/questions/714100/os-detecting-makefile | |||
| @@ -78,7 +78,7 @@ else () | |||
| set(ONLY_CBLAS 0) | |||
| endif () | |||
| include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | |||
| if (NOT DEFINED NUM_THREADS) | |||
| set(NUM_THREADS ${NUM_CORES}) | |||
| @@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy") | |||
| set(OBJCONV "${CROSS_SUFFIX}objconv") | |||
| # OS dependent settings | |||
| include("${CMAKE_SOURCE_DIR}/cmake/os.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/os.cmake") | |||
| # Architecture dependent settings | |||
| include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") | |||
| # C Compiler dependent settings | |||
| include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||
| if (NOT NOFORTRAN) | |||
| # Fortran Compiler dependent settings | |||
| include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||
| endif () | |||
| if (BINARY64) | |||
| @@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX) | |||
| set(SYMBOLSUFFIX "") | |||
| endif () | |||
| set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}") | |||
| set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | |||
| # TODO: need to convert these Makefiles | |||
| # include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake | |||
| # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | |||
| if (${CORE} STREQUAL "PPC440") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") | |||
| @@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") | |||
| set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") | |||
| set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") | |||
| set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}") | |||
| set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}") | |||
| set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}") | |||
| set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}") | |||
| set(LIB_COMPONENTS BLAS) | |||
| @@ -332,6 +332,13 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifdef POWER8 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| #ifndef YIELDING | |||
| @@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_sparc.h" | |||
| #endif | |||
| #ifdef ARCH_MIPS | |||
| #include "common_mips.h" | |||
| #endif | |||
| #ifdef ARCH_MIPS64 | |||
| #include "common_mips64.h" | |||
| #endif | |||
| @@ -615,9 +626,14 @@ void gotoblas_profile_init(void); | |||
| void gotoblas_profile_quit(void); | |||
| #ifdef USE_OPENMP | |||
| #ifndef C_MSVC | |||
| int omp_in_parallel(void); | |||
| int omp_get_num_procs(void); | |||
| #else | |||
| __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||
| #endif | |||
| #else | |||
| #ifdef __ELF__ | |||
| int omp_in_parallel (void) __attribute__ ((weak)); | |||
| int omp_get_num_procs(void) __attribute__ ((weak)); | |||
| @@ -0,0 +1,109 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #ifndef COMMON_MIPS | |||
| #define COMMON_MIPS | |||
| /* MB and WMB expand to nothing in this header. | |||
|    NOTE(review): if callers rely on them as memory barriers, no barrier is emitted here -- confirm this is intended. */ | |||
| #define MB | |||
| #define WMB | |||
| #define INLINE inline | |||
| #define RETURN_BY_COMPLEX | |||
| #ifndef ASSEMBLER | |||
| /* Empty body: this blas_lock() never reads or writes *address. | |||
|    NOTE(review): BLAS_LOCK_DEFINED below presumably stops other headers from supplying a real lock -- confirm. */ | |||
| static void INLINE blas_lock(volatile unsigned long *address){ | |||
| } | |||
| #define BLAS_LOCK_DEFINED | |||
| /* Reads hardware register $30 with the rdhwr instruction and returns it. | |||
|    The value is held in an unsigned long and narrowed to unsigned int by the return type. | |||
|    NOTE(review): presumably used as a cycle/time counter -- confirm what $30 provides on the targeted cores. */ | |||
| static inline unsigned int rpcc(void){ | |||
| unsigned long ret; | |||
| __asm__ __volatile__(".set push \n" | |||
| "rdhwr %0, $30 \n" | |||
| ".set pop" : "=r"(ret) : : "memory"); | |||
| return ret; | |||
| } | |||
| #define RPCC_DEFINED | |||
| /* Plain integer division x / y; y == 0 is not checked. */ | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| } | |||
| /* Both GET_IMAGE macros expand to nothing in this header. */ | |||
| #define GET_IMAGE(res) | |||
| #define GET_IMAGE_CANCEL | |||
| #endif | |||
| #ifndef F_INTERFACE | |||
| #define REALNAME ASMNAME | |||
| #else | |||
| #define REALNAME ASMFNAME | |||
| #endif | |||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||
| /* Entry sequence for assembly kernels: export REALNAME, open a debug | |||
|    function record for it and place its label. Deliberately no | |||
|    instruction-set directive here: ".arm" is an ARM-only pseudo-op and is | |||
|    not accepted by the MIPS assembler. */ | |||
| #define PROLOGUE \ | |||
| .global REALNAME ;\ | |||
| .func REALNAME ;\ | |||
| REALNAME: | |||
| /* Nothing is emitted at function exit or for profiling. */ | |||
| #define EPILOGUE | |||
| #define PROFCODE | |||
| #endif | |||
| #define SEEK_ADDRESS | |||
| #ifndef PAGESIZE | |||
| #define PAGESIZE ( 4 << 10) /* 4 KiB, only when not already defined */ | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 4 << 20) /* 4 MiB */ | |||
| #define BUFFER_SIZE (16 << 20) /* 16 MiB */ | |||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) /* START_ADDRESS minus MAX_CPU_NUMBER buffers of BUFFER_SIZE bytes */ | |||
| #ifndef MAP_ANONYMOUS | |||
| #define MAP_ANONYMOUS MAP_ANON | |||
| #endif | |||
| #endif | |||
| @@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ | |||
| static inline unsigned int rpcc(void){ | |||
| unsigned long ret; | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| // unsigned long long tmp; | |||
| //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); | |||
| //ret=tmp; | |||
| @@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){ | |||
| "rdhwr %0, $2\n" | |||
| ".set pop": "=r"(ret):: "memory"); | |||
| #else | |||
| __asm__ __volatile__(".set push \n" | |||
| ".set mips32r2\n" | |||
| "rdhwr %0, $30 \n" | |||
| ".set pop" : "=r"(ret) : : "memory"); | |||
| #endif | |||
| return ret; | |||
| } | |||
| #define RPCC_DEFINED | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| #ifndef NO_AFFINITY | |||
| #define WHEREAMI | |||
| static inline int WhereAmI(void){ | |||
| @@ -134,7 +127,6 @@ static inline int WhereAmI(void){ | |||
| } | |||
| #endif | |||
| #endif | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| @@ -39,8 +39,13 @@ | |||
| #ifndef COMMON_POWER | |||
| #define COMMON_POWER | |||
| #if defined(POWER8) | |||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
| #else | |||
| #define MB __asm__ __volatile__ ("sync") | |||
| #define WMB __asm__ __volatile__ ("sync") | |||
| #endif | |||
| #define INLINE inline | |||
| @@ -798,7 +803,7 @@ Lmcount$lazy_ptr: | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #endif | |||
| @@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*********************************************************************/ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3A 2 | |||
| #define CPU_LOONGSON3B 3 | |||
| #define CPU_P5600 1 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "SICORTEX", | |||
| "LOONGSON3A", | |||
| "LOONGSON3B" | |||
| "P5600" | |||
| }; | |||
| int detect(void){ | |||
| @@ -120,7 +116,7 @@ int detect(void){ | |||
| if (strstr(p, "loongson3a")) | |||
| return CPU_LOONGSON3A; | |||
| }else{ | |||
| return CPU_SICORTEX; | |||
| return CPU_UNKNOWN; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| @@ -149,64 +145,40 @@ char *get_corename(void){ | |||
| } | |||
| void get_architecture(void){ | |||
| printf("MIPS64"); | |||
| printf("MIPS"); | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("LOONGSON3A"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("LOONGSON3B"); | |||
| if(detect()==CPU_P5600){ | |||
| printf("P5600"); | |||
| }else{ | |||
| printf("SICORTEX"); | |||
| printf("UNKNOWN"); | |||
| } | |||
| } | |||
| void get_subdirname(void){ | |||
| printf("mips64"); | |||
| printf("mips"); | |||
| } | |||
| void get_cpuconfig(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("#define LOONGSON3A\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("#define LOONGSON3B\n"); | |||
| if(detect()==CPU_P5600){ | |||
| printf("#define P5600\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else{ | |||
| printf("#define SICORTEX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| } | |||
| void get_libname(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("loongson3a\n"); | |||
| }else if(detect()==CPU_LOONGSON3B) { | |||
| printf("loongson3b\n"); | |||
| if(detect()==CPU_P5600) { | |||
| printf("p5600\n"); | |||
| }else{ | |||
| #ifdef __mips64 | |||
| printf("mips64\n"); | |||
| #else | |||
| printf("mips32\n"); | |||
| #endif | |||
| printf("mips\n"); | |||
| } | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_UNKNOWN 0 /* CPU ids returned by detect(); each value is also the index of its name in cpuname[] */ | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3A 2 | |||
| #define CPU_LOONGSON3B 3 | |||
| #define CPU_I6400 4 | |||
| #define CPU_P6600 5 | |||
| static char *cpuname[] = { /* indexed by the CPU_* ids above (see get_corename()); keep both lists in the same order */ | |||
| "UNKOWN", /* NOTE(review): spelled without the first N; get_corename() returns this exact string - confirm nothing matches on it before correcting */ | |||
| "SICORTEX", | |||
| "LOONGSON3A", | |||
| "LOONGSON3B", | |||
| "I6400", | |||
| "P6600" | |||
| }; | |||
| #ifdef linux | |||
| /* | |||
|  * Find the first line of /proc/cpuinfo that starts with key and return a | |||
|  * pointer (into buffer) to the text following its ':' separator. | |||
|  * | |||
|  * Returns NULL when the file cannot be opened, when no line starts with | |||
|  * key, or when the first matching line contains no ':'. | |||
|  */ | |||
| static char *cpuinfo_value(const char *key, char *buffer, int size){ | |||
|   FILE *infile; | |||
|   char *value = NULL; | |||
|   size_t keylen = strlen(key); | |||
|  | |||
|   infile = fopen("/proc/cpuinfo", "r"); | |||
|   if (infile == NULL) return NULL; | |||
|  | |||
|   while (fgets(buffer, size, infile)){ | |||
|     if (!strncmp(key, buffer, keylen)){ | |||
|       value = strchr(buffer, ':'); | |||
|       if (value != NULL){ | |||
|         /* Step over the ':' and the blanks after it without running | |||
|            past the end of a short line. */ | |||
|         value++; | |||
|         while (*value == ' ' || *value == '\t') value++; | |||
|       } | |||
|       /* Only the first matching line is considered. */ | |||
|       break; | |||
|     } | |||
|   } | |||
|   fclose(infile); | |||
|   return value; | |||
| } | |||
| #endif | |||
|  | |||
| /* | |||
|  * Identify the CPU from /proc/cpuinfo (only when built with linux defined). | |||
|  * | |||
|  * The first "cpu..." line decides: Loongson-3A / Loongson-3B map to their | |||
|  * ids, a bare "Loongson-3" is resolved through the "system type" line, and | |||
|  * any other value yields CPU_SICORTEX. If that gives no answer, the | |||
|  * "model name" line is checked for Loongson-3A / Loongson-3B. | |||
|  * | |||
|  * Returns one of the CPU_* ids; CPU_UNKNOWN when nothing matched, when | |||
|  * /proc/cpuinfo is unreadable, or on non-linux builds. CPU_I6400 and | |||
|  * CPU_P6600 are never returned by this function. | |||
|  */ | |||
| int detect(void){ | |||
| #ifdef linux | |||
|   char buffer[512], *p; | |||
|  | |||
|   p = cpuinfo_value("cpu", buffer, (int)sizeof(buffer)); | |||
|   if (p != NULL){ | |||
|     if (strstr(p, "Loongson-3A")){ | |||
|       return CPU_LOONGSON3A; | |||
|     }else if (strstr(p, "Loongson-3B")){ | |||
|       return CPU_LOONGSON3B; | |||
|     }else if (strstr(p, "Loongson-3")){ | |||
|       /* Generic Loongson-3: the "system type" line may name the 3A. */ | |||
|       p = cpuinfo_value("system type", buffer, (int)sizeof(buffer)); | |||
|       if (p != NULL && strstr(p, "loongson3a")) | |||
|         return CPU_LOONGSON3A; | |||
|       /* Otherwise fall through to the model-name check below. */ | |||
|     }else{ | |||
|       return CPU_SICORTEX; | |||
|     } | |||
|   } | |||
|  | |||
|   /* Check model name for Loongson3 */ | |||
|   p = cpuinfo_value("model name", buffer, (int)sizeof(buffer)); | |||
|   if (p != NULL){ | |||
|     if (strstr(p, "Loongson-3A")){ | |||
|       return CPU_LOONGSON3A; | |||
|     }else if (strstr(p, "Loongson-3B")){ | |||
|       return CPU_LOONGSON3B; | |||
|     } | |||
|   } | |||
| #endif | |||
|   return CPU_UNKNOWN; | |||
| } | |||
| /* Name of the detected core: the cpuname[] entry at the index detect() reports. */ | |||
| char *get_corename(void){ | |||
|   int cpu_id = detect(); | |||
|   return cpuname[cpu_id]; | |||
| } | |||
| /* Write the architecture tag to stdout, without a trailing newline. */ | |||
| void get_architecture(void){ | |||
|   fputs("MIPS64", stdout); | |||
| } | |||
| /* | |||
|  * Print the sub-architecture name for the detected CPU (no trailing newline). | |||
|  * detect() is evaluated once, so /proc/cpuinfo is not re-parsed per branch. | |||
|  * Every id without its own case, including CPU_UNKNOWN, prints "SICORTEX". | |||
|  */ | |||
| void get_subarchitecture(void){ | |||
|   switch (detect()){ | |||
|   case CPU_LOONGSON3A: | |||
|     printf("LOONGSON3A"); | |||
|     break; | |||
|   case CPU_LOONGSON3B: | |||
|     printf("LOONGSON3B"); | |||
|     break; | |||
|   case CPU_I6400: | |||
|     printf("I6400"); | |||
|     break; | |||
|   case CPU_P6600: | |||
|     printf("P6600"); | |||
|     break; | |||
|   default: | |||
|     printf("SICORTEX"); | |||
|     break; | |||
|   } | |||
| } | |||
| /* Write the kernel sub-directory name to stdout, without a trailing newline. */ | |||
| void get_subdirname(void){ | |||
|   fputs("mips64", stdout); | |||
| } | |||
| /* | |||
|  * Print the "#define ..." configuration lines for the detected CPU: the CPU | |||
|  * name macro followed by L1/L2 cache and DTB parameters. | |||
|  * | |||
|  * detect() is evaluated once. The two Loongson parts share one parameter | |||
|  * set, as do I6400 and P6600; every other id (including CPU_UNKNOWN) gets | |||
|  * the SICORTEX set. | |||
|  * | |||
|  * NOTE(review): L2_SIZE 512488 is not 512 KiB (that would be 524288); the | |||
|  * value is emitted unchanged - confirm whether it is intentional. | |||
|  */ | |||
| void get_cpuconfig(void){ | |||
|   int cpu = detect(); | |||
|  | |||
|   switch (cpu){ | |||
|   case CPU_LOONGSON3A: | |||
|   case CPU_LOONGSON3B: | |||
|     fputs(cpu == CPU_LOONGSON3A ? "#define LOONGSON3A\n" | |||
|                                 : "#define LOONGSON3B\n", stdout); | |||
|     printf("#define L1_DATA_SIZE 65536\n" | |||
|            "#define L1_DATA_LINESIZE 32\n" | |||
|            "#define L2_SIZE 512488\n" | |||
|            "#define L2_LINESIZE 32\n" | |||
|            "#define DTB_DEFAULT_ENTRIES 64\n" | |||
|            "#define DTB_SIZE 4096\n" | |||
|            "#define L2_ASSOCIATIVE 4\n"); | |||
|     break; | |||
|   case CPU_I6400: | |||
|   case CPU_P6600: | |||
|     fputs(cpu == CPU_I6400 ? "#define I6400\n" | |||
|                            : "#define P6600\n", stdout); | |||
|     printf("#define L1_DATA_SIZE 65536\n" | |||
|            "#define L1_DATA_LINESIZE 32\n" | |||
|            "#define L2_SIZE 1048576\n" | |||
|            "#define L2_LINESIZE 32\n" | |||
|            "#define DTB_DEFAULT_ENTRIES 64\n" | |||
|            "#define DTB_SIZE 4096\n" | |||
|            "#define L2_ASSOCIATIVE 8\n"); | |||
|     break; | |||
|   default: | |||
|     printf("#define SICORTEX\n" | |||
|            "#define L1_DATA_SIZE 32768\n" | |||
|            "#define L1_DATA_LINESIZE 32\n" | |||
|            "#define L2_SIZE 512488\n" | |||
|            "#define L2_LINESIZE 32\n" | |||
|            "#define DTB_DEFAULT_ENTRIES 32\n" | |||
|            "#define DTB_SIZE 4096\n" | |||
|            "#define L2_ASSOCIATIVE 8\n"); | |||
|     break; | |||
|   } | |||
| } | |||
| /* | |||
|  * Print the lower-case library name for the detected CPU, followed by a | |||
|  * newline. detect() is evaluated once; ids without their own case | |||
|  * (including CPU_UNKNOWN and CPU_SICORTEX) print "mips64". | |||
|  */ | |||
| void get_libname(void){ | |||
|   switch (detect()){ | |||
|   case CPU_LOONGSON3A: | |||
|     printf("loongson3a\n"); | |||
|     break; | |||
|   case CPU_LOONGSON3B: | |||
|     printf("loongson3b\n"); | |||
|     break; | |||
|   case CPU_I6400: | |||
|     printf("i6400\n"); | |||
|     break; | |||
|   case CPU_P6600: | |||
|     printf("p6600\n"); | |||
|     break; | |||
|   default: | |||
|     printf("mips64\n"); | |||
|     break; | |||
|   } | |||
| } | |||
| @@ -1172,6 +1172,8 @@ int get_cpuname(void){ | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| // Braswell | |||
| case 13: | |||
| // Avoton | |||
| return CPUTYPE_NEHALEM; | |||
| @@ -1678,6 +1680,8 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 12: | |||
| // Braswell | |||
| case 13: | |||
| // Avoton | |||
| return CORE_NEHALEM; | |||
| @@ -110,7 +110,7 @@ ARCH_MIPS64 | |||
| #endif | |||
| #if defined(__mips32) || defined(__mips) | |||
| ARCH_MIPS32 | |||
| ARCH_MIPS | |||
| #endif | |||
| #ifdef __alpha | |||
| @@ -1,4 +1,4 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| enable_language(Fortran) | |||
| @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o | |||
| all :: all1 all2 all3 | |||
| all1: xscblat1 xdcblat1 xccblat1 xzcblat1 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat1 | |||
| OMP_NUM_THREADS=2 ./xdcblat1 | |||
| @@ -53,8 +54,10 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat1 | |||
| endif | |||
| endif | |||
| all2: xscblat2 xdcblat2 xccblat2 xzcblat2 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat2 < sin2 | |||
| OMP_NUM_THREADS=2 ./xdcblat2 < din2 | |||
| @@ -66,8 +69,10 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 | |||
| endif | |||
| endif | |||
| all3: xscblat3 xdcblat3 xccblat3 xzcblat3 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat3 < sin3 | |||
| OMP_NUM_THREADS=2 ./xdcblat3 < din3 | |||
| @@ -88,6 +93,7 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m | |||
| endif | |||
| endif | |||
| @@ -1,5 +1,5 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| # sources that need to be compiled twice, once with no flags and once with LOWER | |||
| set(UL_SOURCES | |||
| @@ -1,4 +1,4 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa | |||
| @@ -1,4 +1,4 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| if (${CORE} STREQUAL "PPC440") | |||
| set(MEMORY memory_qalloc.c) | |||
| @@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Avoton | |||
| if (model == 13) { | |||
| //Intel Braswell / Avoton | |||
| if (model == 12 || model == 13) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| char message[128]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 21; i++) | |||
| for ( i=1 ; i <= 22; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -361,6 +361,9 @@ static void numa_mapping(void) { | |||
| unsigned long work, bit; | |||
| int count = 0; | |||
| int bitmask_idx = 0; | |||
| int current_cpu; | |||
| int current_node = 0; | |||
| int cpu_count = 0; | |||
| for (node = 0; node < common -> num_nodes; node ++) { | |||
| core = 0; | |||
| @@ -382,33 +385,84 @@ static void numa_mapping(void) { | |||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | |||
| #endif | |||
| h = 1; | |||
| while (h < count) h = 2 * h + 1; | |||
| while (h > 1) { | |||
| h /= 2; | |||
| for (i = h; i < count; i++) { | |||
| work = common -> cpu_info[i]; | |||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||
| j = i - h; | |||
| while (work < common -> cpu_info[j]) { | |||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| j -= h; | |||
| if (j < 0) break; | |||
| } | |||
| common -> cpu_info[j + h] = work; | |||
| if (bit) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| current_cpu = sched_getcpu(); | |||
| for (cpu = 0; cpu < count; cpu++) { | |||
| if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { | |||
| current_node = READ_NODE(common -> cpu_info[cpu]); | |||
| break; | |||
| } | |||
| } | |||
| for (i = 0; i < MAX_BITMASK_LEN; i++) | |||
| cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); | |||
| /* | |||
| * If all the processes can be accommodated in the | |||
| * current node itself, then bind to cores | |||
| * from the current node only | |||
| */ | |||
| if (numprocs <= cpu_count) { | |||
| /* | |||
| * First sort all the cores in order from the current node. | |||
| * Then take remaining nodes one by one in order, | |||
| * and sort their cores in order. | |||
| */ | |||
| for (i = 0; i < count; i++) { | |||
| for (j = 0; j < count - 1; j++) { | |||
| int node_1, node_2; | |||
| int core_1, core_2; | |||
| int swap = 0; | |||
| node_1 = READ_NODE(common -> cpu_info[j]); | |||
| node_2 = READ_NODE(common -> cpu_info[j + 1]); | |||
| core_1 = READ_CORE(common -> cpu_info[j]); | |||
| core_2 = READ_CORE(common -> cpu_info[j + 1]); | |||
| if (node_1 == node_2) { | |||
| if (core_1 > core_2) | |||
| swap = 1; | |||
| } else { | |||
| if ((node_2 == current_node) || | |||
| ((node_1 != current_node) && (node_1 > node_2))) | |||
| swap = 1; | |||
| } | |||
| if (swap) { | |||
| unsigned long temp; | |||
| temp = common->cpu_info[j]; | |||
| common->cpu_info[j] = common->cpu_info[j + 1]; | |||
| common->cpu_info[j + 1] = temp; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| h = 1; | |||
| while (h < count) h = 2 * h + 1; | |||
| while (h > 1) { | |||
| h /= 2; | |||
| for (i = h; i < count; i++) { | |||
| work = common -> cpu_info[i]; | |||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||
| j = i - h; | |||
| while (work < common -> cpu_info[j]) { | |||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| j -= h; | |||
| if (j < 0) break; | |||
| } | |||
| common -> cpu_info[j + h] = work; | |||
| if (bit) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -416,7 +470,10 @@ static void numa_mapping(void) { | |||
| fprintf(stderr, "\nSorting ...\n\n"); | |||
| for (cpu = 0; cpu < count; cpu++) | |||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | |||
| fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], | |||
| READ_CPU(common -> cpu_info[cpu]), | |||
| READ_CORE(common -> cpu_info[cpu]), | |||
| READ_NODE(common -> cpu_info[cpu])); | |||
| #endif | |||
| } | |||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||
| endif | |||
| ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) | |||
| #only build without Fortran | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| @@ -114,7 +114,7 @@ if ($compiler eq "") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($data =~ /IBM/) { | |||
| if ($data =~ /IBM XL/) { | |||
| $vendor = IBM; | |||
| $openmp = "-openmp"; | |||
| } | |||
| @@ -223,7 +223,12 @@ if (!$?) { | |||
| } | |||
| #For gfortran MIPS | |||
| if ($?) { | |||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| $mips_data = `$compiler_bin -E -dM - < /dev/null`; | |||
| if ($mips_data =~ /_MIPS_ISA_MIPS64/) { | |||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } else { | |||
| $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
| @@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3A */ | |||
| /* #define FORCE_LOONGSON3B */ | |||
| /* #define FORCE_I6400 */ | |||
| /* #define FORCE_P6600 */ | |||
| /* #define FORCE_P5600 */ | |||
| /* #define FORCE_ITANIUM2 */ | |||
| /* #define FORCE_SPARC */ | |||
| /* #define FORCE_SPARCV7 */ | |||
| @@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_I6400 /* I6400 preset: defines FORCE plus fixed name strings and cache/DTB figures (64 KiB L1D, 1 MiB L2, 8-way L2) */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "I6400" | |||
| #define SUBDIRNAME "mips64" | |||
| #define ARCHCONFIG "-DI6400 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "i6400" | |||
| #define CORENAME "I6400" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_P6600 /* P6600 preset: same figures as the I6400 preset, with P6600 names */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "P6600" | |||
| #define SUBDIRNAME "mips64" | |||
| #define ARCHCONFIG "-DP6600 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "p6600" | |||
| #define CORENAME "P6600" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_P5600 /* P5600 preset: same cache/DTB figures as the two presets above */ | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "P5600" | |||
| #define SUBDIRNAME "mips" /* the only one of these three presets that uses "mips" rather than "mips64" */ | |||
| #define ARCHCONFIG "-DP5600 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "p5600" | |||
| #define CORENAME "P5600" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ITANIUM2 | |||
| #define FORCE | |||
| #define ARCHITECTURE "IA64" | |||
| @@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __mips__ | |||
| #ifdef __mips64 | |||
| #include "cpuid_mips64.c" | |||
| #else | |||
| #include "cpuid_mips.c" | |||
| #endif | |||
| #define OPENBLAS_SUPPORTED | |||
| #endif | |||
| @@ -1,5 +1,5 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| set(BLAS1_SOURCES | |||
| @@ -42,6 +42,10 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| // Disable multi-threading as it does not show any performance | |||
| // benefits. Keep the multi-threading code for the record. | |||
| #undef SMP | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||
| @@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| { | |||
| buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); | |||
| // It seems to be required for some K8 or Barcelona CPU | |||
| buffer_size += 8; | |||
| if(incx != 1) | |||
| buffer_size += n * 2; | |||
| } | |||
| @@ -1,6 +1,6 @@ | |||
| include_directories(${CMAKE_SOURCE_DIR}) | |||
| include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake") | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") | |||
| # Makefile | |||
| @@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), MIPS) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), arm) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| while(j < n) | |||
| { | |||
| @@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| inc_x2 = 2 * inc_x; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| str TMPF, [Y], #SZ | |||
| #else | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.2s}, [X], #8 | |||
| st1 {v0.2s}, [Y], #8 | |||
| ldr d0, [X], #8 | |||
| str d0, [Y], #8 | |||
| #else | |||
| ld1 {v0.2d}, [X], #16 | |||
| st1 {v0.2d}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| #endif | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F4 | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| #else // DOUBLE | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| #endif | |||
| #else // COMPLEX | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| #else // DOUBLE | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| ld1 {v2.4s}, [X], #16 | |||
| ld1 {v3.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| st1 {v2.4s}, [Y], #16 | |||
| st1 {v3.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| ldr q2, [X], #16 | |||
| str q2, [Y], #16 | |||
| ldr q3, [X], #16 | |||
| str q3, [Y], #16 | |||
| #endif | |||
| #endif | |||
| @@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ldp q2, q3, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| @@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ldp q6, q7, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| @@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ldp q2, q3, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0 | |||
| @@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ldp q6, q7, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0 | |||
| @@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #32 | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #32 | |||
| ld1 {v8.2d, v9.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| fmla v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow2, pCRow2, #32 | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| ld1 {v12.2d, v13.2d}, [pCRow3] | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| fmla v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| st1 {v12.2d, v13.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #16 | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #16 | |||
| ld1 {v8.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow2, pCRow2, #16 | |||
| ld1 {v12.2d}, [pCRow1] | |||
| ld1 {v12.2d}, [pCRow3] | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| st1 {v12.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #16 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, #16 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| ld1 {v8.d}[1], [pCRow1] | |||
| @@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st1 {v8.d}[0], [pCRow0] | |||
| st1 {v8.d}[1], [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow1, pCRow2, LDC | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #8 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #8 | |||
| ld1 {v12.d}[0], [pCRow2] | |||
| ld1 {v12.d}[1], [pCRow1] | |||
| ld1 {v12.d}[1], [pCRow3] | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| st1 {v12.d}[1], [pCRow3] | |||
| add pCRow0, pCRow0, #8 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow2, pCRow2, #8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, #8 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| @@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| @@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #64 | |||
| ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| @@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #64 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #32 | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1 , pCRow0, LDC | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #16 | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #16 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| ld1 {v8.d}[1], [pCRow1] | |||
| @@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st1 {v8.d}[0], [pCRow0] | |||
| st1 {v8.d}[1], [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #8 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, #8 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| @@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #64 | |||
| .endm | |||
| @@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #32 | |||
| .endm | |||
| @@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #16 | |||
| .endm | |||
| @@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmadd d8, d16, alpha0, d8 | |||
| str d8, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow0, pCRow0, #8 | |||
| .endm | |||
| @@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /******************************************************************************/ | |||
| .align 5 | |||
| dgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| @@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20: | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_22 | |||
| @@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40: | |||
| dgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_42 | |||
| @@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100: | |||
| dgemm_kernel_L4_M4_END: | |||
| dgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| @@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20: | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M2_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M2_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| dgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_42 | |||
| @@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20: | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M1_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M1_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| dgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_42 | |||
| @@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| tst counterJ , #2 | |||
| ble dgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pC,pC,LDC, lsl #1 | |||
| add pC, pCRow1, LDC | |||
| mov pA, origPA // pA = A | |||
| @@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L2_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M8_40 | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M8_22 | |||
| dgemm_kernel_L2_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M8_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| @@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M4_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_42 | |||
| @@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20: | |||
| dgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| @@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20: | |||
| dgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| @@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L1_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| @@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M8_40 | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M8_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| @@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M4_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_42 | |||
| @@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| @@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20: | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M1_40 | |||
| dgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M1_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define temp x16 | |||
| #define tempOffset x17 | |||
| #define tempK x18 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v05 pA1_2, pA1_3 | |||
| //v06 pA1_4, pA1_5 | |||
| //v07 pA1_6, pA1_7 | |||
| //v08 must save pB0_0, pB0_1 | |||
| //v09 must save pB0_2, pB0_3 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v12 must save pB1_0, pB1_1 | |||
| //v13 must save pB1_2, pB1_3 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 C04, C05 | |||
| @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_I | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v22.2d, v2.2d, v9.d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| fmul v23.2d, v3.2d, v8.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v9.d[0] | |||
| fmul v27.2d, v3.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| fmul v30.2d, v2.2d, v11.d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v30.2d, v2.2d, v9.d[1] | |||
| fmul v31.2d, v3.2d, v9.d[1] | |||
| ldp q6, q7, [pA], #32 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v31.2d, v3.2d, v11.d[0] | |||
| fmul v23.2d, v3.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| ldp q0, q1, [pA], #32 | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q2, q3, [pA], #32 | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| add pCRow1, pCRow0, LDC | |||
| fmov alpha0, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| stp q2, q3, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow2, LDC | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| stp q6, q7, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v0.2d, v24.2d, alphaV0 | |||
| fmul v1.2d, v25.2d, alphaV1 | |||
| fmul v2.2d, v26.2d, alphaV2 | |||
| fmul v3.2d, v27.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] | |||
| fmul v1.2d, v25.2d, alphaV0 | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v2.2d, v26.2d, alphaV0 | |||
| fmul v3.2d, v27.2d, alphaV0 | |||
| stp q2, q3, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0 | |||
| fmul v5.2d, v29.2d, alphaV1 | |||
| fmul v6.2d, v30.2d, alphaV2 | |||
| fmul v7.2d, v31.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v29.2d, alphaV0 | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v6.2d, v30.2d, alphaV0 | |||
| fmul v7.2d, v31.2d, alphaV0 | |||
| stp q6, q7, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| fmul v9.2d, v25.2d, alphaV1 | |||
| fmul v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV2 | |||
| fmul v13.2d, v29.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| fmul v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV2 | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| fmul d8, d16, alpha0 | |||
| str d8, [pCRow0] | |||
| @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha0, d0 | |||
| fmov alpha1, d0 | |||
| fmov alpha2, d0 | |||
| fmov alpha3, d0 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| asr counterL , tempK, #3 // L = K / 8 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dtrmm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dtrmm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: | |||
| dtrmm_kernel_L4_M8_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble dtrmm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dtrmm_kernel_L4_M8_46 | |||
| dtrmm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| dtrmm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SHZ 3 | |||
| #endif | |||
| #define A_PRE_SIZE 768 | |||
| #define Y_PRE_SIZE 768 | |||
| /******************************************************************************/ | |||
| .macro SAVE_REGS | |||
| @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.4s, v3.4s}, [A_PTR], #32 | |||
| ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 | |||
| fmla v4.4s, v1.4s, v2.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v5.4s, v1.4s, v3.4s | |||
| st1 {v4.4s, v5.4s}, [Y_OPTR], #32 | |||
| ld1 {v6.4s, v7.4s}, [A_PTR], #32 | |||
| ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 | |||
| fmla v8.4s, v1.4s, v6.4s | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v9.4s, v1.4s, v7.4s | |||
| st1 {v8.4s, v9.4s}, [Y_OPTR], #32 | |||
| #else //DOUBLE | |||
| ld1 {v2.2d, v3.2d}, [A_PTR], #32 | |||
| ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 | |||
| fmla v4.2d, v1.2d, v2.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v5.2d, v1.2d, v3.2d | |||
| st1 {v4.2d, v5.2d}, [Y_OPTR], #32 | |||
| ld1 {v6.2d, v7.2d}, [A_PTR], #32 | |||
| ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 | |||
| fmla v8.2d, v1.2d, v6.2d | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v9.2d, v1.2d, v7.2d | |||
| st1 {v8.2d, v9.2d}, [Y_OPTR], #32 | |||
| ld1 {v10.2d, v11.2d}, [A_PTR], #32 | |||
| ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 | |||
| fmla v12.2d, v1.2d, v10.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v13.2d, v1.2d, v11.2d | |||
| st1 {v12.2d, v13.2d}, [Y_OPTR], #32 | |||
| ld1 {v14.2d, v15.2d}, [A_PTR], #32 | |||
| ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 | |||
| fmla v16.2d, v1.2d, v14.2d | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v17.2d, v1.2d, v15.2d | |||
| st1 {v16.2d, v17.2d}, [Y_OPTR], #32 | |||
| #endif | |||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define X_PREFETCH_SIZE 768 | |||
| #define A_PREFETCH_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 | |||
| ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 | |||
| fmla v1.4s, v5.4s, v9.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.4s, v6.4s, v10.4s | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.4s, v7.4s, v11.4s | |||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||
| fmla v4.4s, v8.4s, v12.4s | |||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||
| ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 | |||
| fmla v1.4s, v13.4s, v17.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.4s, v14.4s, v18.4s | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.4s, v15.4s, v19.4s | |||
| fmla v4.4s, v16.4s, v20.4s | |||
| #else | |||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v5.2d, v9.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v6.2d, v10.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v7.2d, v11.2d | |||
| fmla v4.2d, v8.2d, v12.2d | |||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v13.2d, v17.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v14.2d, v18.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v15.2d, v19.2d | |||
| fmla v4.2d, v16.2d, v20.2d | |||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v5.2d, v9.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v6.2d, v10.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v7.2d, v11.2d | |||
| fmla v4.2d, v8.2d, v12.2d | |||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v13.2d, v17.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v14.2d, v18.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v15.2d, v19.2d | |||
| fmla v4.2d, v16.2d, v20.2d | |||
| #endif | |||
| @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fabs MAXF, MAXF | |||
| .endm | |||
| // KERNEL_F8: consume the next eight contiguous elements at X (32 bytes when | |||
| // DOUBLE is not defined, 64 bytes when it is), reduce them to the largest | |||
| // absolute value in TMPF, and fold that into the running MAXF/INDEX pair. | |||
| // INDEX is set to Z, the counter value at entry to this group, not to the | |||
| // position of the winning lane inside the group. | |||
| // NOTE(review): the exact lane is presumably resolved afterwards by | |||
| // KERNEL_F8_FINALIZE - confirm against the caller. | |||
| // Clobbers v2-v3 (and v4-v5 for DOUBLE), TMPF and the condition flags. | |||
| .macro KERNEL_F8 | |||
| #if !defined(DOUBLE) | |||
| // Load 2 x 128 bits (eight .s lanes) and post-increment X by 32 bytes. | |||
| ldp q2, q3, [X], #32 | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| // Lane-wise max of the two vectors, then across-lane max into TMPF. | |||
| fmax v2.4s, v2.4s, v3.4s | |||
| fmaxv TMPF, v2.4s | |||
| // When COND (defined outside this macro) holds for "MAXF vs TMPF" the current | |||
| // MAXF/INDEX are kept; otherwise they are replaced by TMPF/Z. | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| // Advance the position counter past the eight elements just consumed. | |||
| add Z, Z, #8 | |||
| #else | |||
| // Load 4 x 128 bits (eight .d lanes) and post-increment X by 64 bytes in total. | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| // Tree of lane-wise maxima down to one vector, then pairwise max into TMPF. | |||
| fmax v2.2d, v2.2d, v3.2d | |||
| fmax v4.2d, v4.2d, v5.2d | |||
| fmax v2.2d, v2.2d, v4.2d | |||
| fmaxp TMPF, v2.2d | |||
| // Same keep-or-replace update of MAXF/INDEX as in the single-precision branch. | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #endif | |||
| // Prefetch 1024 bytes ahead of the (already advanced) X pointer. | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| .endm | |||
| // KERNEL_F8_FINALIZE: refine INDEX from "start of an 8-element group" to the | |||
| // exact element. It re-reads eight elements starting at x7 + (INDEX - 1) * | |||
| // element size, takes their absolute values and compares each with MAXF for | |||
| // equality, walking from the highest position (INDEX + 7) down to the lowest | |||
| // (INDEX + 0). Because every match overwrites INDEX and the walk is descending, | |||
| // the lowest matching position is the one left in INDEX. | |||
| // NOTE(review): x7 is expected to hold the base address of the vector on | |||
| // entry - confirm against the caller. | |||
| // Clobbers x6, x7 (advanced in place, so it no longer holds the base | |||
| // afterwards), v2-v7 and the condition flags. | |||
| .macro KERNEL_F8_FINALIZE | |||
| // x6 = zero-based element offset of the group recorded in INDEX. | |||
| sub x6, INDEX, #1 | |||
| #if !defined(DOUBLE) | |||
| // Scale by 4 bytes per element and point x7 at the group. | |||
| lsl x6, x6, #2 | |||
| add x7, x7, x6 | |||
| // Reload the eight elements of the group: v2 = positions 0-3, v3 = positions 4-7. | |||
| ldp q2, q3, [x7] | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| // Copy positions 4-7 into lane 0 of v4-v7 so they can be used as scalars s4-s7. | |||
| ins v4.s[0], v3.s[0] | |||
| ins v5.s[0], v3.s[1] | |||
| ins v6.s[0], v3.s[2] | |||
| ins v7.s[0], v3.s[3] | |||
| // x6 is the candidate index, starting at position 7 and decremented once per | |||
| // compare. It is derived from INDEX only here, so later updates of INDEX do | |||
| // not disturb the remaining candidates. | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, s7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| // Now positions 0-3 (still held in v2), checked in order 3, 2, 1, 0. | |||
| ins v4.s[0], v2.s[0] | |||
| ins v5.s[0], v2.s[1] | |||
| ins v6.s[0], v2.s[2] | |||
| ins v7.s[0], v2.s[3] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #else | |||
| // Skip four elements so the first load covers positions 4-7 of the group, | |||
| // then scale by 8 bytes per element. | |||
| add x6, x6, #4 | |||
| lsl x6, x6, #3 | |||
| add x7, x7, x6 | |||
| // v2 = positions 4-5, v3 = positions 6-7. | |||
| ldp q2, q3, [x7] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| // Spread the four values into lane 0 of v4-v7 for scalar compares (d4-d7). | |||
| ins v4.d[0], v2.d[0] | |||
| ins v5.d[0], v2.d[1] | |||
| ins v6.d[0], v3.d[0] | |||
| ins v7.d[0], v3.d[1] | |||
| // Candidate index starts at position 7 and counts down. | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d4 | |||
| csel INDEX, x6, INDEX, eq | |||
| // Step back 32 bytes (four doubles) to positions 0-3 of the same group. | |||
| sub x7, x7, #32 | |||
| ldp q2, q3, [x7] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| ins v4.d[0], v2.d[0] | |||
| ins v5.d[0], v2.d[1] | |||
| ins v6.d[0], v3.d[0] | |||
| ins v7.d[0], v3.d[1] | |||
| // Positions 3, 2, 1, 0 in that order; x6 keeps counting down. | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| ld1 TMPVF, [X], INC_X | |||
| add Z, Z, #1 | |||
| @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| // Unit-stride fast path: taken only when INC_X == 1; any other stride goes to | |||
| // the strided loop at iamax_kernel_S_BEGIN. | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| // Keep the starting X in x7; KERNEL_F8_FINALIZE reads x7 to re-load a group. | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| // NOTE(review): INIT_S is defined elsewhere; presumably it seeds MAXF/INDEX/Z | |||
| // from the first element and advances X - confirm. | |||
| INIT_S | |||
| // One element has been accounted for; finish if nothing remains. | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| // I = number of full 8-element groups in the remainder. | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq iamax_kernel_F1 | |||
| // Z is raised by one for the group loop and lowered again after it. | |||
| // NOTE(review): presumably because KERNEL_F8 records Z before advancing it | |||
| // while KERNEL_S1 advances Z first - confirm. | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| // Turn the group-level INDEX into the exact element index. | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| // I = N mod 8 leftover elements, handled one at a time. | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| .endm | |||
| // KERNEL_F8: consume 64 bytes (DOUBLE not defined) or 128 bytes (DOUBLE | |||
| // defined) at X. After fabs, adjacent lanes are added pairwise, so each of the | |||
| // eight resulting values is the sum of the absolute values of two neighbouring | |||
| // lanes. | |||
| // NOTE(review): the lane pairs are presumably the real and imaginary parts of | |||
| // one complex element - confirm against the data layout. | |||
| // The largest of the eight sums goes to TMPF and is folded into MAXF/INDEX | |||
| // under COND; INDEX receives Z, the counter value at entry to this group, not | |||
| // the position of the winning pair. | |||
| // Clobbers v2-v5 (plus v16-v19 for DOUBLE), TMPF and the condition flags. | |||
| .macro KERNEL_F8 | |||
| #if !defined(DOUBLE) | |||
| // Load 4 x 128 bits (sixteen .s lanes) and post-increment X by 64 bytes. | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| fabs v4.4s, v4.4s | |||
| fabs v5.4s, v5.4s | |||
| // Pairwise add: v2 = sums for pairs 0-3, v3 = sums for pairs 4-7. | |||
| faddp v2.4s, v2.4s, v3.4s | |||
| faddp v3.4s, v4.4s, v5.4s | |||
| // Lane-wise max, then across-lane max into TMPF. | |||
| fmax v2.4s, v2.4s, v3.4s | |||
| fmaxv TMPF, v2.4s | |||
| // When COND (defined outside this macro) holds, MAXF/INDEX are kept; | |||
| // otherwise they become TMPF/Z. | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| // Eight pairs consumed. | |||
| add Z, Z, #8 | |||
| #else | |||
| // Load 8 x 128 bits (sixteen .d lanes) and post-increment X by 128 bytes. | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| ldp q16, q17, [X], #32 | |||
| ldp q18, q19, [X], #32 | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| fabs v16.2d, v16.2d | |||
| fabs v17.2d, v17.2d | |||
| fabs v18.2d, v18.2d | |||
| fabs v19.2d, v19.2d | |||
| // Pairwise add: v2 = pairs 0-1, v3 = pairs 2-3, v4 = pairs 4-5, v5 = pairs 6-7. | |||
| faddp v2.2d, v2.2d, v3.2d | |||
| faddp v3.2d, v4.2d, v5.2d | |||
| faddp v4.2d, v16.2d, v17.2d | |||
| faddp v5.2d, v18.2d, v19.2d | |||
| // Tree of lane-wise maxima, then pairwise max into TMPF. | |||
| fmax v2.2d, v2.2d, v3.2d | |||
| fmax v4.2d, v4.2d, v5.2d | |||
| fmax v2.2d, v2.2d, v4.2d | |||
| fmaxp TMPF, v2.2d | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #endif | |||
| // Prefetch 1024 bytes ahead of the (already advanced) X pointer. | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| .endm | |||
| // KERNEL_F8_FINALIZE: refine INDEX from "start of an 8-pair group" to the | |||
| // exact position. It re-reads the group starting at x7 + (INDEX - 1) * pair | |||
| // size, recomputes the eight pairwise sums of absolute values with the same | |||
| // fabs + faddp sequence as KERNEL_F8, and compares each with MAXF for | |||
| // equality, from position INDEX + 7 down to INDEX + 0. The walk is descending | |||
| // and every match overwrites INDEX, so the lowest matching position wins. | |||
| // NOTE(review): x7 is expected to hold the base address of the vector on | |||
| // entry - confirm against the caller. | |||
| // Clobbers x6, x7 (advanced in place), v2-v5, v4 lane 0 or v7 lane 0 as the | |||
| // scalar scratch, v16-v19 for DOUBLE, and the condition flags. | |||
| .macro KERNEL_F8_FINALIZE | |||
| // x6 = zero-based offset of the group recorded in INDEX. | |||
| sub x6, INDEX, #1 | |||
| #if !defined(DOUBLE) | |||
| // Scale by 8 bytes per pair and point x7 at the group. | |||
| lsl x6, x6, #3 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| ldp q4, q5, [x7, #32] | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| fabs v4.4s, v4.4s | |||
| fabs v5.4s, v5.4s | |||
| // v2 = sums for positions 0-3, v3 = sums for positions 4-7. | |||
| faddp v2.4s, v2.4s, v3.4s | |||
| faddp v3.4s, v4.4s, v5.4s | |||
| // Each candidate is moved into v4 lane 0 and compared as scalar s4. x6 is the | |||
| // candidate index, taken from INDEX once and then decremented per compare. | |||
| ins v4.s[0], v3.s[3] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[2] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| // Positions 3 down to 0 come from v2. | |||
| ins v4.s[0], v2.s[3] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[2] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #else | |||
| // Scale by 16 bytes per pair and point x7 at the group. | |||
| lsl x6, x6, #4 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| ldp q4, q5, [x7, #32] | |||
| ldp q16, q17, [x7, #64] | |||
| ldp q18, q19, [x7, #96] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| fabs v16.2d, v16.2d | |||
| fabs v17.2d, v17.2d | |||
| fabs v18.2d, v18.2d | |||
| fabs v19.2d, v19.2d | |||
| // v2 = positions 0-1, v3 = positions 2-3, v4 = positions 4-5, v5 = positions 6-7. | |||
| faddp v2.2d, v2.2d, v3.2d | |||
| faddp v3.2d, v4.2d, v5.2d | |||
| faddp v4.2d, v16.2d, v17.2d | |||
| faddp v5.2d, v18.2d, v19.2d | |||
| // Each candidate is moved into v7 lane 0 and compared as scalar d7, from | |||
| // position 7 down to position 0. | |||
| ins v7.d[0], v5.d[1] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v5.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v4.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v4.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v3.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v3.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v2.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v2.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| #if !defined(DOUBLE) | |||
| ld1 {v1.2s}, [X], INC_X | |||
| @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| // Unit-stride fast path: taken only when INC_X == 1; any other stride goes to | |||
| // the strided loop at iamax_kernel_S_BEGIN. | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| // Keep the starting X in x7; KERNEL_F8_FINALIZE reads x7 to re-load a group. | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| // NOTE(review): INIT_S is defined elsewhere; presumably it seeds MAXF/INDEX/Z | |||
| // from the first element and advances X - confirm. | |||
| INIT_S | |||
| // One element has been accounted for; finish if nothing remains. | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| // I = number of full 8-element groups in the remainder. | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| ble iamax_kernel_F1 | |||
| // Z is raised by one for the group loop and lowered again after it. | |||
| // NOTE(review): presumably because KERNEL_F8 records Z before advancing it | |||
| // while KERNEL_S1 advances Z first - confirm. | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| // Turn the group-level INDEX into the exact element index. | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| // I = N mod 8 leftover elements, handled one at a time. | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alpha_save_R | |||
| // 18 must save alpha_save_I | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| ld2 {v0.2d, v1.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmla v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmla v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| ld2 {v4.2d, v5.2d}, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmla v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmla d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmla d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmla d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmla d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble zgemm_kernel_L2_BEGIN | |||
| zgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| zgemm_kernel_L4_M4_BEGIN: | |||
| @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt zgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble zgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M4_22 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble zgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: | |||
| zgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ands counterL , origK, #7 | |||
| ble zgemm_kernel_L4_M4_100 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne zgemm_kernel_L4_M4_46 | |||
| zgemm_kernel_L4_M4_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE4x4 | |||
| @@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define Y_OPTR x13 /* loop Y vector address */ | |||
| #define X_PTR x14 /* loop X vector address */ | |||
| #define A_PRE_SIZE 768 | |||
| #define Y_PRE_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(DOUBLE) | |||
| #define ALPHA_R s0 | |||
| #define ALPHA_I s1 | |||
| #define ALPHA_R_COPY s7 | |||
| #define ALPHA_I_COPY s8 | |||
| #define SHZ 3 | |||
| #else | |||
| #define ALPHA_R d0 | |||
| #define ALPHA_I d1 | |||
| #define ALPHA_R_COPY d7 | |||
| #define ALPHA_I_COPY d8 | |||
| #define SHZ 4 | |||
| #endif | |||
| @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro INIT | |||
| /********** INIT FOR F4 LOOP **********/ | |||
| fmov ALPHA_R_COPY, ALPHA_R | |||
| fmov ALPHA_I_COPY, ALPHA_I | |||
| #if !defined(DOUBLE) | |||
| ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) | |||
| ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) | |||
| ins v7.d[1], v7.d[0] | |||
| ins v8.d[1], v8.d[0] | |||
| #else | |||
| ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) | |||
| ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) | |||
| #endif | |||
| /******* INIT FOR F1 AND S1 LOOP ******/ | |||
| #if !defined(DOUBLE) | |||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||
| eor v2.16b, v2.16b, v2.16b | |||
| @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro INIT_LOOP | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| #if !defined(DOUBLE) | |||
| ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] | |||
| ins v10.s[0], v9.s[1] | |||
| ins v9.s[1], v9.s[0] // [R(X), R(X)] | |||
| ins v10.s[1], v10.s[0] // [I(X), I(X)] | |||
| ins v9.d[1], v9.d[0] | |||
| ins v10.d[1], v10.d[0] | |||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||
| fmul v2.2s, v0.2s, v2.2s | |||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||
| ins v3.s[0], v2.s[1] | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||
| fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| dup v22.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v23.4s, v25.s[0] // -I[TEMP] | |||
| dup v24.4s, v3.s[0] // I[TEMP] | |||
| #else | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||
| fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| dup v22.4s, v2.s[0] // R[TEMP] | |||
| dup v23.4s, v3.s[0] // I[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v24.4s, v25.s[0] // -I[TEMP] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||
| fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s2 | |||
| dup v22.4s, v25.s[0] // R[TEMP] | |||
| dup v23.4s, v3.s[0] // I[TEMP] | |||
| dup v24.4s, v3.s[0] // I[TEMP] | |||
| #else | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||
| eor v12.16b, v12.16b, v12.16b | |||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||
| fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s2 | |||
| dup v22.4s, v25.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v23.4s, v25.s[0] // I[TEMP] | |||
| dup v24.4s, v25.s[0] // I[TEMP] | |||
| #endif | |||
| #endif // CONJ | |||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||
| fmul v2.2s, v0.2s, v2.2s | |||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||
| ins v3.s[0], v2.s[1] | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| eor v4.16b, v4.16b, v4.16b | |||
| @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif // CONJ | |||
| #else // DOUBLE | |||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||
| fmul v2.2d, v0.2d, v2.2d | |||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] | |||
| ins v10.d[0], v9.d[1] | |||
| ins v9.d[1], v9.d[0] // [R(X), R(X)] | |||
| ins v10.d[1], v10.d[0] // [I(X), I(X)] | |||
| /****** INIT_LOOP FOR F4 LOOP ******/ | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||
| fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| dup v22.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v23.2d, v25.d[0] // -I[TEMP] | |||
| dup v24.2d, v3.d[0] // I[TEMP] | |||
| #else | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||
| fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| dup v22.2d, v2.d[0] // R[TEMP] | |||
| dup v23.2d, v3.d[0] // I[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v24.2d, v25.d[0] // -I[TEMP] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||
| fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d2 | |||
| dup v22.2d, v25.d[0] // R[TEMP] | |||
| dup v23.2d, v3.d[0] // I[TEMP] | |||
| dup v24.2d, v3.d[0] // I[TEMP] | |||
| #else | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||
| eor v12.16b, v12.16b, v12.16b | |||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||
| fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d2 | |||
| dup v22.2d, v25.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v23.2d, v25.d[0] // I[TEMP] | |||
| dup v24.2d, v25.d[0] // I[TEMP] | |||
| #endif | |||
| #endif // CONJ | |||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||
| fmul v2.2d, v0.2d, v2.2d | |||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| eor v4.16b, v4.16b, v4.16b | |||
| @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | |||
| ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v15.4s, v21.4s, v13.4s | |||
| fmla v15.4s, v23.4s, v14.4s | |||
| fmla v16.4s, v22.4s, v14.4s | |||
| fmla v16.4s, v24.4s, v13.4s | |||
| st2 {v15.4s, v16.4s}, [Y_OPTR], #32 | |||
| #else // DOUBLE | |||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | |||
| ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v15.2d, v21.2d, v13.2d | |||
| fmla v15.2d, v23.2d, v14.2d | |||
| fmla v16.2d, v22.2d, v14.2d | |||
| fmla v16.2d, v24.2d, v13.2d | |||
| st2 {v15.2d, v16.2d}, [Y_OPTR], #32 | |||
| ld2 {v17.2d, v18.2d}, [A_PTR], #32 | |||
| ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v19.2d, v21.2d, v17.2d | |||
| fmla v19.2d, v23.2d, v18.2d | |||
| fmla v20.2d, v22.2d, v18.2d | |||
| fmla v20.2d, v24.2d, v17.2d | |||
| st2 {v19.2d, v20.2d}, [Y_OPTR], #32 | |||
| #endif | |||
| @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: | |||
| zgemv_n_kernel_F4: | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F4 | |||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define A_PRE_SIZE 768 | |||
| #define X_PRE_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 | |||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else // DOUBLE | |||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 | |||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [X_PTR, #512] | |||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 | |||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [A_PTR, #512] | |||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define temp x19 | |||
| #define tempOffset x20 | |||
| #define tempK x21 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 offset | |||
| // 07 offset -> temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save temp | |||
| // 19 must save tempOffset | |||
| // 20 must save tempK | |||
| // 21 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alpha_save_R | |||
| // 18 must save alpha_save_I | |||
| // 19 must save temp | |||
| // 20 must save tempOffset | |||
| // 21 must save tempK | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| add pCRow1, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| fmul v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmul v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmul v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| fmul v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmul v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmul d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmul d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmul d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmul d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble ztrmm_kernel_L2_BEGIN | |||
| ztrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble ztrmm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , tempK, #3 | |||
| cmp counterL , #2 | |||
| blt ztrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 | |||
| ble ztrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M4_22 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ztrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: | |||
| ztrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble ztrmm_kernel_L4_M4_100 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne ztrmm_kernel_L4_M4_46 | |||
| ztrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| ztrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne ztrmm_kernel_L4_M4_20 | |||
| @@ -0,0 +1,46 @@ | |||
| ifndef SNRM2KERNEL | |||
| SNRM2KERNEL = nrm2.c | |||
| endif | |||
| ifndef DNRM2KERNEL | |||
| DNRM2KERNEL = nrm2.c | |||
| endif | |||
| ifndef CNRM2KERNEL | |||
| CNRM2KERNEL = znrm2.c | |||
| endif | |||
| ifndef ZNRM2KERNEL | |||
| ZNRM2KERNEL = znrm2.c | |||
| endif | |||
| ifndef SCABS_KERNEL | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| endif | |||
| ifndef DCABS_KERNEL | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| endif | |||
| ifndef QCABS_KERNEL | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| endif | |||
| ifndef LSAME_KERNEL | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| endif | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| @@ -0,0 +1,221 @@ | |||
| SAMAXKERNEL = ../mips/amax.c | |||
| DAMAXKERNEL = ../mips/amax.c | |||
| CAMAXKERNEL = ../mips/zamax.c | |||
| ZAMAXKERNEL = ../mips/zamax.c | |||
| SAMINKERNEL = ../mips/amin.c | |||
| DAMINKERNEL = ../mips/amin.c | |||
| CAMINKERNEL = ../mips/zamin.c | |||
| ZAMINKERNEL = ../mips/zamin.c | |||
| SMAXKERNEL = ../mips/max.c | |||
| DMAXKERNEL = ../mips/max.c | |||
| SMINKERNEL = ../mips/min.c | |||
| DMINKERNEL = ../mips/min.c | |||
| ISAMAXKERNEL = ../mips/iamax.c | |||
| IDAMAXKERNEL = ../mips/iamax.c | |||
| ICAMAXKERNEL = ../mips/izamax.c | |||
| IZAMAXKERNEL = ../mips/izamax.c | |||
| ISAMINKERNEL = ../mips/iamin.c | |||
| IDAMINKERNEL = ../mips/iamin.c | |||
| ICAMINKERNEL = ../mips/izamin.c | |||
| IZAMINKERNEL = ../mips/izamin.c | |||
| ISMAXKERNEL = ../mips/imax.c | |||
| IDMAXKERNEL = ../mips/imax.c | |||
| ISMINKERNEL = ../mips/imin.c | |||
| IDMINKERNEL = ../mips/imin.c | |||
| ifdef HAVE_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| CASUMKERNEL = ../mips/casum_msa.c | |||
| ZASUMKERNEL = ../mips/zasum_msa.c | |||
| else | |||
| SASUMKERNEL = ../mips/asum.c | |||
| DASUMKERNEL = ../mips/asum.c | |||
| CASUMKERNEL = ../mips/asum.c | |||
| ZASUMKERNEL = ../mips/asum.c | |||
| endif | |||
| SAXPYKERNEL = ../mips/axpy.c | |||
| DAXPYKERNEL = ../mips/axpy.c | |||
| CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| SCOPYKERNEL = ../mips/copy.c | |||
| DCOPYKERNEL = ../mips/copy.c | |||
| CCOPYKERNEL = ../mips/zcopy.c | |||
| ZCOPYKERNEL = ../mips/zcopy.c | |||
| ifdef HAVE_MSA | |||
| SDOTKERNEL = ../mips/sdot_msa.c | |||
| DDOTKERNEL = ../mips/ddot_msa.c | |||
| CDOTKERNEL = ../mips/cdot_msa.c | |||
| ZDOTKERNEL = ../mips/zdot_msa.c | |||
| else | |||
| SDOTKERNEL = ../mips/dot.c | |||
| DDOTKERNEL = ../mips/dot.c | |||
| CDOTKERNEL = ../mips/zdot.c | |||
| ZDOTKERNEL = ../mips/zdot.c | |||
| endif | |||
| SNRM2KERNEL = ../mips/nrm2.c | |||
| DNRM2KERNEL = ../mips/nrm2.c | |||
| CNRM2KERNEL = ../mips/znrm2.c | |||
| ZNRM2KERNEL = ../mips/znrm2.c | |||
| SROTKERNEL = ../mips/rot.c | |||
| DROTKERNEL = ../mips/rot.c | |||
| CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| SSCALKERNEL = ../mips/scal.c | |||
| DSCALKERNEL = ../mips/scal.c | |||
| CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| SSWAPKERNEL = ../mips/swap.c | |||
| DSWAPKERNEL = ../mips/swap.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| ifdef HAVE_MSA | |||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n_msa.c | |||
| else | |||
| SGEMVNKERNEL = ../mips/gemv_n.c | |||
| DGEMVNKERNEL = ../mips/gemv_n.c | |||
| CGEMVNKERNEL = ../mips/zgemv_n.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t_msa.c | |||
| else | |||
| SGEMVTKERNEL = ../mips/gemv_t.c | |||
| DGEMVTKERNEL = ../mips/gemv_t.c | |||
| CGEMVTKERNEL = ../mips/zgemv_t.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| else | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| endif | |||
| ifdef HAVE_MSA | |||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||
| DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c | |||
| DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| else | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| endif | |||
# CGEMM kernel and packing-copy sources.
# The outer-copy object names are identical in both configurations.
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ifdef HAVE_MSA
# 8x4 MSA kernel: separate inner (8-wide) and outer (4-wide) copy routines,
# so the inner-copy sources and objects are defined only here.
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
else
# Generic 2x2 complex C kernel; no inner-copy variables are set in this case.
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# ZGEMM kernel and packing-copy sources.
# The copy object names do not depend on HAVE_MSA, so they are set once.
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

ifdef HAVE_MSA
# 4x4 MSA kernel with matching 4-wide copy routines.
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
else
# Generic 2x2 complex C kernel with 2-wide copy routines.
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# STRSM kernels (LN/LT/RN/RT variants).
# Generic C sources unless HAVE_MSA is defined, in which case the 8x8 MSA
# sources are used.
ifndef HAVE_MSA
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
endif
# DTRSM kernels (LN/LT/RN/RT variants).
# Generic C sources unless HAVE_MSA is defined, in which case the 8x4 MSA
# sources are used.
ifndef HAVE_MSA
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
endif
# CTRSM kernels (LN/LT/RN/RT variants).
# The generic C sources are selected whether or not HAVE_MSA is defined,
# so no conditional is needed here.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# ZTRSM kernels (LN/LT/RN/RT variants).
# The generic C sources are selected whether or not HAVE_MSA is defined,
# so no conditional is needed here.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
| @@ -0,0 +1,2 @@ | |||
| clean :: | |||
| @@ -0,0 +1,66 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT maxf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| maxf=ABS(x[0]); | |||
| ix += inc_x; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( ABS(x[ix]) > maxf ) | |||
| { | |||
| maxf = ABS(x[ix]); | |||
| } | |||
| ix += inc_x; | |||
| i++; | |||
| } | |||
| return(maxf); | |||
| } | |||
| @@ -0,0 +1,66 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| minf=ABS(x[0]); | |||
| ix += inc_x; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( ABS(x[ix]) < minf ) | |||
| { | |||
| minf = ABS(x[ix]); | |||
| } | |||
| ix += inc_x; | |||
| i++; | |||
| } | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,57 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| n *= inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i += inc_x; | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,95 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| ix = 0; | |||
| iy = 0; | |||
| if ( beta == 0.0 ) | |||
| { | |||
| if ( alpha == 0.0 ) | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = 0.0 ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = alpha * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( alpha == 0.0 ) | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = beta * y[iy] ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = alpha * x[ix] + beta * y[iy] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,54 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| ix = 0; | |||
| iy = 0; | |||
| while(i < n) | |||
| { | |||
| y[iy] += da * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,338 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include "macros_msa.h" | |||
| #define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i, inc_x2; | |||
| FLOAT sumf = 0.0; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v4f32 zero_v = {0}; | |||
| v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| if (n > 15) | |||
| { | |||
| n -= 16; | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src1); | |||
| sum_abs2 = AND_VEC_W(src2); | |||
| sum_abs3 = AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| { | |||
| LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_SP3_INC(x, 4, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_SP2_INC(x, 4, src0, src1); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| src0 = LD_SP(x); x += 4; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| sumf += fabsf(*(x + 0)); | |||
| sumf += fabsf(*(x + 1)); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| if (n > 8) | |||
| { | |||
| n -= 8; | |||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src1); | |||
| sum_abs2 = AND_VEC_W(src2); | |||
| sum_abs3 = AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| if (n & 7) | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) | |||
| { | |||
| LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| } | |||
| else if ((n & 4) && (n & 1)) | |||
| { | |||
| LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| } | |||
| else if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_SP3_INC(x, inc_x2, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_SP2_INC(x, inc_x2, src0, src1); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| src0 = LD_SP(x); x += inc_x2; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| } | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| return (sumf); | |||
| } | |||
| @@ -0,0 +1,361 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Operators used by the scalar part of the dot product, which is written as
 *     dot[0] += ( xr * yr OP3 xi * yi );
 *     dot[1] OP2 ( xi * yr OP4 xr * yi );
 * With CONJ defined the imaginary-part products change sign relative to the
 * non-conjugated form.
 */
#if defined(CONJ)
    #define OP2 -=
    #define OP3 +
    #define OP4 -
#else
    #define OP2 +=
    #define OP3 -
    #define OP4 +
#endif
/*
 * One multiply-accumulate step on already separated real/imaginary operands:
 *
 *     dot0 +=     xr * yr;
 *     dot0 OPR0=  xi * yi;
 *     dot1 OPR1=  xi * yr;
 *     dot1 +=     xr * yi;
 *
 * OPR0 and OPR1 are the bare tokens '+' or '-'; they are pasted onto '=' to
 * form the compound assignment. The accumulators `dot0` and `dot1` must
 * exist in the scope where the macro is expanded.
 */
#define DOT_KERNEL_STEP(OPR0, OPR1, xr, xi, yr, yi)  \
    dot0 += (xr * yr);                               \
    dot0 OPR0## = (xi * yi);                         \
    dot1 OPR1## = (xi * yr);                         \
    dot1 += (xr * yi);

/*
 * DOT<N>_KERNEL applies the step above to operand sets 0 .. N/4 - 1, in
 * increasing order, reading the variables vx<k>r, vx<k>i, vy<k>r, vy<k>i
 * from the expanding scope.
 */
#define DOT4_KERNEL(OPR0, OPR1)                               \
    DOT_KERNEL_STEP(OPR0, OPR1, vx0r, vx0i, vy0r, vy0i)

#define DOT8_KERNEL(OPR0, OPR1)                               \
    DOT_KERNEL_STEP(OPR0, OPR1, vx0r, vx0i, vy0r, vy0i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx1r, vx1i, vy1r, vy1i)

#define DOT12_KERNEL(OPR0, OPR1)                              \
    DOT_KERNEL_STEP(OPR0, OPR1, vx0r, vx0i, vy0r, vy0i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx1r, vx1i, vy1r, vy1i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx2r, vx2i, vy2r, vy2i)

#define DOT16_KERNEL(OPR0, OPR1)                              \
    DOT_KERNEL_STEP(OPR0, OPR1, vx0r, vx0i, vy0r, vy0i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx1r, vx1i, vy1r, vy1i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx2r, vx2i, vy2r, vy2i)       \
    DOT_KERNEL_STEP(OPR0, OPR1, vx3r, vx3i, vy3r, vy3i)
| /* return float, x,y float */ | |||
| /* cdotc - CONJ */ | |||
| /* cdotu - !CONJ */ | |||
| #ifndef _MSC_VER | |||
| #include <complex.h> | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT dot[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||
| v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||
| v4f32 dot0 = {0, 0, 0, 0}; | |||
| v4f32 dot1 = {0, 0, 0, 0}; | |||
| openblas_complex_float result; | |||
| dot[0] = 0.0; | |||
| dot[1] = 0.0; | |||
| __real__(result) = 0.0; | |||
| __imag__(result) = 0.0; | |||
| if ( n < 1 ) return(result); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); | |||
| #if !defined(CONJ) | |||
| DOT16_KERNEL(-, +); | |||
| #else | |||
| DOT16_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| LD_SP2_INC(x, 4, vx4, vx5); | |||
| LD_SP2_INC(y, 4, vy4, vy5); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||
| #if !defined(CONJ) | |||
| DOT12_KERNEL(-, +); | |||
| #else | |||
| DOT12_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| #if !defined(CONJ) | |||
| DOT8_KERNEL(-, +); | |||
| #else | |||
| DOT8_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| #if !defined(CONJ) | |||
| DOT4_KERNEL(-, +); | |||
| #else | |||
| DOT4_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5); | |||
| LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5); | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_GP4_INC(x, 1, x0, x1, x2, x3); | |||
| LD_GP4_INC(y, 1, y0, y1, y2, y3); | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, x0, x1); | |||
| LD_GP2_INC(y, 1, y0, y1); | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| } | |||
| } | |||
| dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); | |||
| dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| x4 = *x; | |||
| x5 = *(x + 1); | |||
| x += inc_x2; | |||
| x6 = *x; | |||
| x7 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| y4 = *y; | |||
| y5 = *(y + 1); | |||
| y += inc_y2; | |||
| y6 = *y; | |||
| y7 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| dot[0] += ( x6 * y6 OP3 x7 * y7 ); | |||
| dot[1] OP2 ( x7 * y6 OP4 x6 * y7 ); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| x4 = *x; | |||
| x5 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| y4 = *y; | |||
| y5 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| } | |||
| } | |||
| __real__(result) = dot[0]; | |||
| __imag__(result) = dot[1]; | |||
| return(result); | |||
| } | |||
| @@ -0,0 +1,195 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 dst0, dst1, dst4, dst5; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
| lda *= 2; | |||
| for (j = (n >> 2); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| pdst += 8; | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| pdst += 4; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| psrc1 += 4; | |||
| ST_SP(src0, pdst); | |||
| pdst += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| psrc1 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| pdst += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,310 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; | |||
| FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; | |||
| FLOAT ctemp15, ctemp16; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
| lda *= 2; | |||
| for (j = (n >> 3); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc5 = psrc4 + lda; | |||
| psrc6 = psrc5 + lda; | |||
| psrc7 = psrc6 + lda; | |||
| psrc8 = psrc7 + lda; | |||
| psrc0 += 8 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| LD_SP2_INC(psrc5, 4, src8, src9); | |||
| LD_SP2_INC(psrc6, 4, src10, src11); | |||
| LD_SP2_INC(psrc7, 4, src12, src13); | |||
| LD_SP2_INC(psrc8, 4, src14, src15); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_SP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_SP(src15, src13, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| src8 = LD_SP(psrc5); | |||
| src10 = LD_SP(psrc6); | |||
| src12 = LD_SP(psrc7); | |||
| src14 = LD_SP(psrc8); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| psrc5 += 4; | |||
| psrc6 += 4; | |||
| psrc7 += 4; | |||
| psrc8 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_SP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_SP(src14, src12, dst3, dst7); | |||
| ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
| ctemp09 = *(psrc5 + 0); | |||
| ctemp10 = *(psrc5 + 1); | |||
| ctemp11 = *(psrc6 + 0); | |||
| ctemp12 = *(psrc6 + 1); | |||
| ctemp13 = *(psrc7 + 0); | |||
| ctemp14 = *(psrc7 + 1); | |||
| ctemp15 = *(psrc8 + 0); | |||
| ctemp16 = *(psrc8 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| psrc5 += 2; | |||
| psrc6 += 2; | |||
| psrc7 += 2; | |||
| psrc8 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| *(pdst + 8) = ctemp09; | |||
| *(pdst + 9) = ctemp10; | |||
| *(pdst + 10) = ctemp11; | |||
| *(pdst + 11) = ctemp12; | |||
| *(pdst + 12) = ctemp13; | |||
| *(pdst + 13) = ctemp14; | |||
| *(pdst + 14) = ctemp15; | |||
| *(pdst + 15) = ctemp16; | |||
| pdst += 16; | |||
| } | |||
| } | |||
| if (n & 4) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| LD_SP2_INC(psrc3, 4, src4, src5); | |||
| LD_SP2_INC(psrc4, 4, src6, src7); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_SP(src7, src5, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| src4 = LD_SP(psrc3); | |||
| src6 = LD_SP(psrc4); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| psrc3 += 4; | |||
| psrc4 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_SP(src6, src4, dst1, dst5); | |||
| ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| ctemp05 = *(psrc3 + 0); | |||
| ctemp06 = *(psrc3 + 1); | |||
| ctemp07 = *(psrc4 + 0); | |||
| ctemp08 = *(psrc4 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| psrc3 += 2; | |||
| psrc4 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| *(pdst + 4) = ctemp05; | |||
| *(pdst + 5) = ctemp06; | |||
| *(pdst + 6) = ctemp07; | |||
| *(pdst + 7) = ctemp08; | |||
| pdst += 8; | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| LD_SP2_INC(psrc2, 4, src2, src3); | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| ILVRL_D2_SP(src3, src1, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src2 = LD_SP(psrc2); | |||
| psrc1 += 4; | |||
| psrc2 += 4; | |||
| ILVRL_D2_SP(src2, src0, dst0, dst4); | |||
| ST_SP2_INC(dst0, dst4, pdst, 4); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| *(pdst + 2) = ctemp03; | |||
| *(pdst + 3) = ctemp04; | |||
| pdst += 4; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2_INC(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst, 4); | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| psrc1 += 4; | |||
| ST_SP(src0, pdst); | |||
| pdst += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| psrc1 += 2; | |||
| *(pdst + 0) = ctemp01; | |||
| *(pdst + 1) = ctemp02; | |||
| pdst += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,125 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0; | |||
| FLOAT *psrc1, *psrc2; | |||
| FLOAT *pdst0; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| v4f32 src0, src1, src2, src3; | |||
| psrc0 = src; | |||
| pdst0 = dst; | |||
| lda *= 2; | |||
| for (j = (n >> 2); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 8; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 4; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| ST_SP(src0, pdst0); | |||
| pdst0 += 4; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 2; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| pdst0 += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,214 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *pdst0; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| psrc0 = src; | |||
| pdst0 = dst; | |||
| lda *= 2; | |||
| for (j = (n >> 3); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 16; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||
| LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); | |||
| LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); | |||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||
| ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| LD_SP4(psrc2, 4, src4, src5, src6, src7); | |||
| ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| LD_SP4(psrc1, 4, src0, src1, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| } | |||
| } | |||
| if (n & 4) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 8; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| LD_SP2(psrc1 + 2 * lda, 4, src4, src5); | |||
| LD_SP2(psrc2 + 2 * lda, 4, src6, src7); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| LD_SP2(psrc2, 4, src2, src3); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| LD_SP2(psrc1, 4, src0, src1); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 4; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| src2 = LD_SP(psrc1 + 2 * lda); | |||
| src3 = LD_SP(psrc2 + 2 * lda); | |||
| ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); | |||
| psrc1 += 4 * lda; | |||
| psrc2 += 4 * lda; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| src1 = LD_SP(psrc2); | |||
| ST_SP2_INC(src0, src1, pdst0, 4); | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| src0 = LD_SP(psrc1); | |||
| ST_SP(src0, pdst0); | |||
| pdst0 += 4; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc0 + lda; | |||
| psrc0 += 2; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 2) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| ctemp03 = *(psrc2 + 0); | |||
| ctemp04 = *(psrc2 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| *(pdst0 + 2) = ctemp03; | |||
| *(pdst0 + 3) = ctemp04; | |||
| psrc1 += 2 * lda; | |||
| psrc2 += 2 * lda; | |||
| pdst0 += 4; | |||
| } | |||
| if (m & 1) | |||
| { | |||
| ctemp01 = *(psrc1 + 0); | |||
| ctemp02 = *(psrc1 + 1); | |||
| *(pdst0 + 0) = ctemp01; | |||
| *(pdst0 + 1) = ctemp02; | |||
| pdst0 += 2; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,611 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Compound-assignment operators used as sign selectors.  Any earlier
 * definitions are discarded first so this file controls all five.
 */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

/* OP3 / OP4 depend on XCONJ only. */
#if defined(XCONJ)
#define OP3 +=
#define OP4 -=
#else
#define OP3 -=
#define OP4 +=
#endif

/*
 * OP0 / OP1 / OP2 depend on both CONJ and XCONJ.  In CGEMV_N_8x4 below:
 *   OP0 applies  tpNi * srcMi  to the real accumulators,
 *   OP1 applies  tpNr * srcMi  to the imaginary accumulators,
 *   OP2 applies  tpNi * srcMr  to the imaginary accumulators.
 */
#if defined(CONJ) && defined(XCONJ)
#define OP0 -=
#define OP1 -=
#define OP2 +=
#elif defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 -=
#elif defined(XCONJ)
#define OP0 +=
#define OP1 +=
#define OP2 -=
#else
#define OP0 -=
#define OP1 +=
#define OP2 +=
#endif
/*
 * CGEMV_N_8x4: statement macro that adds the contribution of four lines
 * (pa0..pa3, read at offset k) to two pairs of accumulators, y0r/y0i and
 * y1r/y1i.  Every identifier it uses (pa0..pa3, k, t0..t15, srcNr/srcNi,
 * tpNr/tpNi, y0r, y1r, y0i, y1i) must already exist at the expansion site.
 *
 * PCKEVOD_W2_SP presumably separates each vector pair into its even words
 * (srcNr, real parts) and odd words (srcNi, imaginary parts).
 * NOTE(review): confirm in macros_msa.h.
 *
 * The updates to each accumulator are kept in a fixed order; do not reorder
 * them, since that could change floating-point rounding.
 */
#define CGEMV_N_8x4()                          \
    /* four vectors from each of the lines */  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);        \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);        \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);      \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);    \
                                               \
    /* split into srcNr / srcNi */             \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);       \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);       \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);       \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);       \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);       \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);     \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);     \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);     \
                                               \
    /* real accumulators: tpNr * srcMr */      \
    y0r += tp0r * src0r;                       \
    y1r += tp0r * src1r;                       \
    y0r += tp1r * src2r;                       \
    y1r += tp1r * src3r;                       \
    y0r += tp2r * src4r;                       \
    y1r += tp2r * src5r;                       \
    y0r += tp3r * src6r;                       \
    y1r += tp3r * src7r;                       \
                                               \
    /* real accumulators: tpNi * srcMi, OP0 */ \
    y0r OP0 tp0i * src0i;                      \
    y1r OP0 tp0i * src1i;                      \
    y0r OP0 tp1i * src2i;                      \
    y1r OP0 tp1i * src3i;                      \
    y0r OP0 tp2i * src4i;                      \
    y1r OP0 tp2i * src5i;                      \
    y0r OP0 tp3i * src6i;                      \
    y1r OP0 tp3i * src7i;                      \
                                               \
    /* imag accumulators: tpNr * srcMi, OP1 */ \
    y0i OP1 tp0r * src0i;                      \
    y1i OP1 tp0r * src1i;                      \
    y0i OP1 tp1r * src2i;                      \
    y1i OP1 tp1r * src3i;                      \
    y0i OP1 tp2r * src4i;                      \
    y1i OP1 tp2r * src5i;                      \
    y0i OP1 tp3r * src6i;                      \
    y1i OP1 tp3r * src7i;                      \
                                               \
    /* imag accumulators: tpNi * srcMr, OP2 */ \
    y0i OP2 tp0i * src0r;                      \
    y1i OP2 tp0i * src1r;                      \
    y0i OP2 tp1i * src2r;                      \
    y1i OP2 tp1i * src3r;                      \
    y0i OP2 tp2i * src4r;                      \
    y1i OP2 tp2i * src5r;                      \
    y0i OP2 tp3i * src6r;                      \
    y1i OP2 tp3i * src7r;
| #define CGEMV_N_4x4() /* Same update as CGEMV_N_8x4 but for four complex y values (y0r/y0i only), using one real/imaginary vector pair per A stream. */ \ | |||
| LD_SP2(pa0 + k, 4, t0, t1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP2(pa1 + k, 4, t4, t5); \ | |||
| LD_SP2(pa2 + k, 4, t8, t9); \ | |||
| LD_SP2(pa3 + k, 4, t12, t13); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ | |||
| PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ | |||
| \ | |||
| y0r += tp0r * src0r; /* real part: += t_r * a_r */ \ | |||
| y0r += tp1r * src2r; \ | |||
| y0r += tp2r * src4r; \ | |||
| y0r += tp3r * src6r; \ | |||
| \ | |||
| y0r OP0 tp0i * src0i; /* real part: OP0 t_i * a_i */ \ | |||
| y0r OP0 tp1i * src2i; \ | |||
| y0r OP0 tp2i * src4i; \ | |||
| y0r OP0 tp3i * src6i; \ | |||
| \ | |||
| y0i OP1 tp0r * src0i; /* imaginary part: OP1 t_r * a_i */ \ | |||
| y0i OP1 tp1r * src2i; \ | |||
| y0i OP1 tp2r * src4i; \ | |||
| y0i OP1 tp3r * src6i; \ | |||
| \ | |||
| y0i OP2 tp0i * src0r; /* imaginary part: OP2 t_i * a_r */ \ | |||
| y0i OP2 tp1i * src2r; \ | |||
| y0i OP2 tp2i * src4r; \ | |||
| y0i OP2 tp3i * src6r; \ | |||
| #define CGEMV_N_1x4() /* Scalar tail: updates the single complex value at y[0], y[1] with the contributions of pa0..pa3 at offset k, weighted by temp0..temp3. */ \ | |||
| res0 = y[0 * inc_y2]; /* current real part of y */ \ | |||
| res1 = y[0 * inc_y2 + 1]; /* current imaginary part of y */ \ | |||
| \ | |||
| res0 += temp0_r * pa0[k]; /* real part: t_r * a_r, then OP0 t_i * a_i, for each of the four streams */ \ | |||
| res0 OP0 temp0_i * pa0[k + 1]; \ | |||
| res0 += temp1_r * pa1[k]; \ | |||
| res0 OP0 temp1_i * pa1[k + 1]; \ | |||
| res0 += temp2_r * pa2[k]; \ | |||
| res0 OP0 temp2_i * pa2[k + 1]; \ | |||
| res0 += temp3_r * pa3[k]; \ | |||
| res0 OP0 temp3_i * pa3[k + 1]; \ | |||
| \ | |||
| res1 OP1 temp0_r * pa0[k + 1]; /* imaginary part: OP1 t_r * a_i, then OP2 t_i * a_r, for each stream */ \ | |||
| res1 OP2 temp0_i * pa0[k]; \ | |||
| res1 OP1 temp1_r * pa1[k + 1]; \ | |||
| res1 OP2 temp1_i * pa1[k]; \ | |||
| res1 OP1 temp2_r * pa2[k + 1]; \ | |||
| res1 OP2 temp2_i * pa2[k]; \ | |||
| res1 OP1 temp3_r * pa3[k + 1]; \ | |||
| res1 OP2 temp3_i * pa3[k]; \ | |||
| \ | |||
| y[0 * inc_y2] = res0; /* write the updated value back */ \ | |||
| y[0 * inc_y2 + 1] = res1; \ | |||
| #define CGEMV_N_8x2() /* Two-stream variant of CGEMV_N_8x4: adds the contribution of pa0 and pa1 (weighted by tp0, tp1) to eight complex y values in y0r/y1r and y0i/y1i. */ \ | |||
| LD_SP4(pa0 + k, 4, t0, t1, t2, t3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ | |||
| \ | |||
| y0r += tp0r * src0r; /* real part: += t_r * a_r */ \ | |||
| y1r += tp0r * src1r; \ | |||
| y0r += tp1r * src2r; \ | |||
| y1r += tp1r * src3r; \ | |||
| \ | |||
| y0r OP0 tp0i * src0i; /* real part: OP0 t_i * a_i */ \ | |||
| y1r OP0 tp0i * src1i; \ | |||
| y0r OP0 tp1i * src2i; \ | |||
| y1r OP0 tp1i * src3i; \ | |||
| \ | |||
| y0i OP1 tp0r * src0i; /* imaginary part: OP1 t_r * a_i */ \ | |||
| y1i OP1 tp0r * src1i; \ | |||
| y0i OP1 tp1r * src2i; \ | |||
| y1i OP1 tp1r * src3i; \ | |||
| \ | |||
| y0i OP2 tp0i * src0r; /* imaginary part: OP2 t_i * a_r */ \ | |||
| y1i OP2 tp0i * src1r; \ | |||
| y0i OP2 tp1i * src2r; \ | |||
| y1i OP2 tp1i * src3r; \ | |||
| #define CGEMV_N_4x2() /* Two-stream, four-value variant: adds the contribution of pa0 and pa1 (weighted by tp0, tp1) to the four complex y values in y0r/y0i. */ \ | |||
| LD_SP2(pa0 + k, 4, t0, t1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP2(pa1 + k, 4, t4, t5); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| \ | |||
| y0r += tp0r * src0r; /* real part: += t_r * a_r */ \ | |||
| y0r += tp1r * src2r; \ | |||
| \ | |||
| y0r OP0 tp0i * src0i; /* real part: OP0 t_i * a_i */ \ | |||
| y0r OP0 tp1i * src2i; \ | |||
| \ | |||
| y0i OP1 tp0r * src0i; /* imaginary part: OP1 t_r * a_i */ \ | |||
| y0i OP1 tp1r * src2i; \ | |||
| \ | |||
| y0i OP2 tp0i * src0r; /* imaginary part: OP2 t_i * a_r */ \ | |||
| y0i OP2 tp1i * src2r; \ | |||
| #define CGEMV_N_1x2() /* Scalar tail for two streams: updates the complex value at y[0], y[1] with pa0 and pa1 at offset k, weighted by temp0 and temp1. */ \ | |||
| res0 = y[0 * inc_y2]; /* current real part of y */ \ | |||
| res1 = y[0 * inc_y2 + 1]; /* current imaginary part of y */ \ | |||
| \ | |||
| res0 += temp0_r * pa0[k]; /* real part: t_r * a_r, then OP0 t_i * a_i */ \ | |||
| res0 OP0 temp0_i * pa0[k + 1]; \ | |||
| res0 += temp1_r * pa1[k]; \ | |||
| res0 OP0 temp1_i * pa1[k + 1]; \ | |||
| \ | |||
| res1 OP1 temp0_r * pa0[k + 1]; /* imaginary part: OP1 t_r * a_i, then OP2 t_i * a_r */ \ | |||
| res1 OP2 temp0_i * pa0[k]; \ | |||
| res1 OP1 temp1_r * pa1[k + 1]; \ | |||
| res1 OP2 temp1_i * pa1[k]; \ | |||
| \ | |||
| y[0 * inc_y2] = res0; /* write the updated value back */ \ | |||
| y[0 * inc_y2 + 1] = res1; \ | |||
| #define CGEMV_N_1x1() /* Scalar update of one complex y value with one A element (pa0[k], pa0[k + 1]) weighted by temp_r/temp_i. */ \ | |||
| res0 = y[0 * inc_y2]; /* current real part of y */ \ | |||
| res1 = y[0 * inc_y2 + 1]; /* current imaginary part of y */ \ | |||
| \ | |||
| res0 += temp_r * pa0[k]; /* real part: t_r * a_r, then OP0 t_i * a_i */ \ | |||
| res0 OP0 temp_i * pa0[k + 1]; \ | |||
| \ | |||
| res1 OP1 temp_r * pa0[k + 1]; /* imaginary part: OP1 t_r * a_i, then OP2 t_i * a_r */ \ | |||
| res1 OP2 temp_i * pa0[k]; \ | |||
| \ | |||
| y[0 * inc_y2] = res0; /* write the updated value back */ \ | |||
| y[0 * inc_y2 + 1] = res1; \ | |||
| #define CLOAD_X4_SCALE_VECTOR() /* Reads x with vector loads, combines it with alpha into tp4r/tp4i, then spreads the four results into tp0..tp3 (r and i). Selected by CNAME when inc_x2 == 2. */ \ | |||
| LD_SP2(x, 4, x0, x1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| \ | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); /* presumably separates real (x0r) and imaginary (x0i) lanes - confirm */ \ | |||
| \ | |||
| tp4r = alphar * x0r; /* temp_r = alpha_r * x_r OP3 alpha_i * x_i */ \ | |||
| tp4r OP3 alphai * x0i; \ | |||
| tp4i = alphar * x0i; /* temp_i = alpha_r * x_i OP4 alpha_i * x_r */ \ | |||
| tp4i OP4 alphai * x0r; \ | |||
| \ | |||
| SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); /* SPLATI_W4_SP is defined in macros_msa.h; presumably broadcasts each lane of tp4r into its own vector - confirm */ \ | |||
| SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ | |||
| #define CLOAD_X4_SCALE_GP() /* Strided counterpart of CLOAD_X4_SCALE_VECTOR: gathers four x values spaced inc_x2 floats apart, one lane at a time, then combines them with alpha. */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); /* tp0r only serves as the starting vector; lanes 0..3 are all overwritten by these four inserts. The float is read through an int pointer to pass its bit pattern. */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); /* same gather for the imaginary parts (offset + 1) */ \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ | |||
| \ | |||
| tp4r = alphar * x0r; /* temp_r = alpha_r * x_r OP3 alpha_i * x_i */ \ | |||
| tp4r OP3 alphai * x0i; \ | |||
| tp4i = alphar * x0i; /* temp_i = alpha_r * x_i OP4 alpha_i * x_r */ \ | |||
| tp4i OP4 alphai * x0r; \ | |||
| \ | |||
| SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); /* presumably broadcasts each lane into its own vector - confirm in macros_msa.h */ \ | |||
| SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ | |||
| #define CLOAD_X2_SCALE_GP() /* Combines two x values (spaced inc_x2 floats apart) with alpha in scalar code, then copies each result into a vector for the 8x2 and 4x2 macros. */ \ | |||
| temp0_r = alpha_r * x[0 * inc_x2]; /* temp_r = alpha_r * x_r OP3 alpha_i * x_i */ \ | |||
| temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ | |||
| temp0_i = alpha_r * x[0 * inc_x2 + 1]; /* temp_i = alpha_r * x_i OP4 alpha_i * x_r */ \ | |||
| temp0_i OP4 alpha_i * x[0 * inc_x2]; \ | |||
| \ | |||
| temp1_r = alpha_r * x[1 * inc_x2]; /* same for the second x value */ \ | |||
| temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \ | |||
| temp1_i = alpha_r * x[1 * inc_x2 + 1]; \ | |||
| temp1_i OP4 alpha_i * x[1 * inc_x2]; \ | |||
| \ | |||
| tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); /* COPY_FLOAT_TO_VECTOR is defined in macros_msa.h; presumably replicates the scalar into every lane - confirm */ \ | |||
| tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \ | |||
| tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \ | |||
| tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \ | |||
| #define CLOAD_X1_SCALE_GP() /* Combines the single x value at x[0], x[1] with alpha into the scalars temp_r/temp_i used by CGEMV_N_1x1. */ \ | |||
| temp_r = alpha_r * x[0 * inc_x2]; /* temp_r = alpha_r * x_r OP3 alpha_i * x_i */ \ | |||
| temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ | |||
| temp_i = alpha_r * x[0 * inc_x2 + 1]; /* temp_i = alpha_r * x_i OP4 alpha_i * x_r */ \ | |||
| temp_i OP4 alpha_i * x[0 * inc_x2]; \ | |||
| #define CLOAD_Y8_VECTOR() /* Vector load of y into y0r/y0i and y1r/y1i; selected by CNAME when inc_y2 == 2. */ \ | |||
| LD_SP4(y, 4, y0, y1, y2, y3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); \ | |||
| #define CLOAD_Y4_VECTOR() /* Vector load of y into y0r/y0i only. */ \ | |||
| LD_SP2(y, 4, y0, y1); \ | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ | |||
| #define CSTORE_Y8_VECTOR() /* Counterpart of CLOAD_Y8_VECTOR: recombines y0r/y0i and y1r/y1i into y0..y3 and stores them at y. */ \ | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); /* ILVRL_W2_SP is defined in macros_msa.h; presumably re-interleaves real and imaginary lanes - confirm */ \ | |||
| ILVRL_W2_SP(y1i, y1r, y2, y3); \ | |||
| ST_SP4(y0, y1, y2, y3, y, 4); /* ST_SP4 is defined in macros_msa.h; presumably stores four vectors spaced 4 floats apart - confirm */ \ | |||
| #define CSTORE_Y4_VECTOR() /* Counterpart of CLOAD_Y4_VECTOR: recombines y0r/y0i into y0, y1 and stores them at y. */ \ | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); \ | |||
| ST_SP2(y0, y1, y, 4); \ | |||
| #define CLOAD_Y8_GP() /* Strided load of eight complex y values (spaced inc_y2 floats apart) into y0r/y1r (real) and y0i/y1i (imaginary), one lane at a time. */ \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); /* tp0r only serves as the starting vector; lanes 0..3 are all overwritten. Floats are read through an int pointer to pass their bit pattern. */ \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ | |||
| y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); /* real parts of elements 4..7 */ \ | |||
| y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \ | |||
| y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \ | |||
| y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); /* imaginary parts of elements 0..3 (offset + 1) */ \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ | |||
| y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); /* imaginary parts of elements 4..7 */ \ | |||
| y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \ | |||
| y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \ | |||
| y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \ | |||
| #define CLOAD_Y4_GP() /* Strided load of four complex y values (spaced inc_y2 floats apart) into y0r (real) and y0i (imaginary), one lane at a time. */ \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); /* tp0r only serves as the starting vector; lanes 0..3 are all overwritten */ \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ | |||
| y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); /* imaginary parts (offset + 1) */ \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ | |||
| y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ | |||
| #define CSTORE_Y8_GP() /* Strided store matching CLOAD_Y8_GP: writes the lanes of y0r/y1r and y0i/y1i back to eight complex y values spaced inc_y2 floats apart. */ \ | |||
| *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); /* each lane is copied out as a 32-bit word and stored through an int pointer; real parts first */ \ | |||
| *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ | |||
| *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ | |||
| *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ | |||
| *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \ | |||
| *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \ | |||
| *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \ | |||
| *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \ | |||
| *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); /* imaginary parts (offset + 1) */ \ | |||
| *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ | |||
| *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ | |||
| *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ | |||
| *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \ | |||
| *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \ | |||
| *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \ | |||
| *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \ | |||
| #define CSTORE_Y4_GP() /* Strided store matching CLOAD_Y4_GP: writes the lanes of y0r and y0i back to four complex y values spaced inc_y2 floats apart. */ \ | |||
| *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); /* each lane is copied out as a 32-bit word and stored through an int pointer; real parts first */ \ | |||
| *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ | |||
| *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ | |||
| *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ | |||
| *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); /* imaginary parts (offset + 1) */ \ | |||
| *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ | |||
| *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ | |||
| *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ | |||
| #define CGEMV_N_MSA() /* Kernel driver: walks the n A streams in groups of 4, then 2, then 1; for each group it scales x by alpha and sweeps all m y values in chunks of 8, 4 and 1. The CLOAD_ and CSTORE_ names without a suffix must be defined by the caller before expansion. */ \ | |||
| for (j = (n >> 2); j--;) /* groups of four streams (pa0..pa3) */ \ | |||
| { \ | |||
| CLOAD_X4_SCALE(); \ | |||
| \ | |||
| k = 0; /* float offset into the current A streams */ \ | |||
| y = y_org; /* every group starts again at the beginning of y */ \ | |||
| \ | |||
| for (i = (m >> 3); i--;) /* eight y values per iteration */ \ | |||
| { \ | |||
| CLOAD_Y8() \ | |||
| CGEMV_N_8x4(); \ | |||
| CSTORE_Y8(); \ | |||
| \ | |||
| k += 2 * 8; /* two floats per complex value */ \ | |||
| y += inc_y2 * 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) /* one block of four y values */ \ | |||
| { \ | |||
| CLOAD_Y4(); \ | |||
| CGEMV_N_4x4(); \ | |||
| CSTORE_Y4(); \ | |||
| \ | |||
| k += 2 * 4; \ | |||
| y += inc_y2 * 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 3) /* up to three remaining y values, handled in scalar code */ \ | |||
| { \ | |||
| temp0_r = tp4r[0]; /* pull the four scaled x values out of the vectors computed by CLOAD_X4_SCALE */ \ | |||
| temp1_r = tp4r[1]; \ | |||
| temp2_r = tp4r[2]; \ | |||
| temp3_r = tp4r[3]; \ | |||
| \ | |||
| temp0_i = tp4i[0]; \ | |||
| temp1_i = tp4i[1]; \ | |||
| temp2_i = tp4i[2]; \ | |||
| temp3_i = tp4i[3]; \ | |||
| \ | |||
| for (i = (m & 3); i--;) \ | |||
| { \ | |||
| CGEMV_N_1x4(); \ | |||
| \ | |||
| k += 2; \ | |||
| y += inc_y2; \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| pa0 += 4 * lda2; /* advance all four stream pointers to the next group */ \ | |||
| pa1 += 4 * lda2; \ | |||
| pa2 += 4 * lda2; \ | |||
| pa3 += 4 * lda2; \ | |||
| \ | |||
| x += 4 * inc_x2; \ | |||
| } \ | |||
| \ | |||
| if (n & 2) /* one group of two streams (pa0, pa1) */ \ | |||
| { \ | |||
| CLOAD_X2_SCALE(); \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| CLOAD_Y8(); \ | |||
| CGEMV_N_8x2(); \ | |||
| CSTORE_Y8(); \ | |||
| \ | |||
| k += 2 * 8; \ | |||
| y += inc_y2 * 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| CLOAD_Y4(); \ | |||
| CGEMV_N_4x2(); \ | |||
| CSTORE_Y4(); \ | |||
| \ | |||
| k += 2 * 4; \ | |||
| y += inc_y2 * 4; \ | |||
| } \ | |||
| \ | |||
| for (i = (m & 3); i--;) /* scalar tail; temp0 and temp1 were already set as scalars by CLOAD_X2_SCALE_GP */ \ | |||
| { \ | |||
| CGEMV_N_1x2(); \ | |||
| \ | |||
| k += 2; \ | |||
| y += inc_y2; \ | |||
| } \ | |||
| \ | |||
| pa0 += 2 * lda2; \ | |||
| pa1 += 2 * lda2; \ | |||
| \ | |||
| x += 2 * inc_x2; \ | |||
| } \ | |||
| \ | |||
| if (n & 1) /* last single stream, processed entirely in scalar code */ \ | |||
| { \ | |||
| CLOAD_X1_SCALE(); \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = m; i--;) \ | |||
| { \ | |||
| CGEMV_N_1x1(); \ | |||
| \ | |||
| k += 2; \ | |||
| y += inc_y2; \ | |||
| } \ | |||
| \ | |||
| pa0 += lda2; \ | |||
| x += inc_x2; \ | |||
| } \ | |||
| /* Kernel entry point: accumulates into the m complex values of y the products of the n A streams (lda2 apart) with x scaled by alpha, choosing vector or strided load/store macros from the x and y strides. dummy1 and buffer are not referenced. Always returns 0. */ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, | |||
| BLASLONG inc_y2, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| FLOAT *y_org = y; /* start of y; CGEMV_N_MSA rewinds to it for every group of streams */ | |||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||
| FLOAT temp_r, temp_i, res0, res1, temp0_r; | |||
| FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; | |||
| v4f32 alphar, alphai; | |||
| v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; | |||
| v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||
| v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; | |||
| v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; | |||
| v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i; | |||
| lda2 = 2 * lda2; /* strides are doubled because the macros address real and imaginary parts as separate floats (offsets 0 and 1) */ | |||
| inc_x2 = 2 * inc_x2; | |||
| inc_y2 = 2 * inc_y2; | |||
| pa0 = A; /* four consecutive streams of A, lda2 floats apart */ | |||
| pa1 = A + lda2; | |||
| pa2 = A + 2 * lda2; | |||
| pa3 = A + 3 * lda2; | |||
| alphar = COPY_FLOAT_TO_VECTOR(alpha_r); /* COPY_FLOAT_TO_VECTOR is defined in macros_msa.h; presumably replicates the scalar into every lane - confirm */ | |||
| alphai = COPY_FLOAT_TO_VECTOR(alpha_i); | |||
| if ((2 == inc_x2) && (2 == inc_y2)) /* x and y both contiguous: vector access for both */ | |||
| { | |||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR | |||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||
| #define CLOAD_Y8 CLOAD_Y8_VECTOR | |||
| #define CLOAD_Y4 CLOAD_Y4_VECTOR | |||
| #define CSTORE_Y8 CSTORE_Y8_VECTOR | |||
| #define CSTORE_Y4 CSTORE_Y4_VECTOR | |||
| CGEMV_N_MSA(); | |||
| #undef CLOAD_X4_SCALE | |||
| #undef CLOAD_X2_SCALE | |||
| #undef CLOAD_X1_SCALE | |||
| #undef CLOAD_Y8 | |||
| #undef CLOAD_Y4 | |||
| #undef CSTORE_Y8 | |||
| #undef CSTORE_Y4 | |||
| } | |||
| else if (2 == inc_x2) /* x contiguous, y strided: vector x, lane-by-lane y */ | |||
| { | |||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR | |||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||
| #define CLOAD_Y8 CLOAD_Y8_GP | |||
| #define CLOAD_Y4 CLOAD_Y4_GP | |||
| #define CSTORE_Y8 CSTORE_Y8_GP | |||
| #define CSTORE_Y4 CSTORE_Y4_GP | |||
| CGEMV_N_MSA(); | |||
| #undef CLOAD_X4_SCALE | |||
| #undef CLOAD_X2_SCALE | |||
| #undef CLOAD_X1_SCALE | |||
| #undef CLOAD_Y8 | |||
| #undef CLOAD_Y4 | |||
| #undef CSTORE_Y8 | |||
| #undef CSTORE_Y4 | |||
| } | |||
| else if (2 == inc_y2) /* y contiguous, x strided: lane-by-lane x, vector y */ | |||
| { | |||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP | |||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||
| #define CLOAD_Y8 CLOAD_Y8_VECTOR | |||
| #define CLOAD_Y4 CLOAD_Y4_VECTOR | |||
| #define CSTORE_Y8 CSTORE_Y8_VECTOR | |||
| #define CSTORE_Y4 CSTORE_Y4_VECTOR | |||
| CGEMV_N_MSA(); | |||
| #undef CLOAD_X4_SCALE | |||
| #undef CLOAD_X2_SCALE | |||
| #undef CLOAD_X1_SCALE | |||
| #undef CLOAD_Y8 | |||
| #undef CLOAD_Y4 | |||
| #undef CSTORE_Y8 | |||
| #undef CSTORE_Y4 | |||
| } | |||
| else /* both strided: lane-by-lane access for x and y */ | |||
| { | |||
| #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP | |||
| #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP | |||
| #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP | |||
| #define CLOAD_Y8 CLOAD_Y8_GP | |||
| #define CLOAD_Y4 CLOAD_Y4_GP | |||
| #define CSTORE_Y8 CSTORE_Y8_GP | |||
| #define CSTORE_Y4 CSTORE_Y4_GP | |||
| CGEMV_N_MSA(); | |||
| #undef CLOAD_X4_SCALE | |||
| #undef CLOAD_X2_SCALE | |||
| #undef CLOAD_X1_SCALE | |||
| #undef CLOAD_Y8 | |||
| #undef CLOAD_Y4 | |||
| #undef CSTORE_Y8 | |||
| #undef CSTORE_Y4 | |||
| } | |||
| return(0); | |||
| } | |||
| #undef OP0 /* the five sign-operator macros used by this kernel are removed again at the end of the file */ | |||
| #undef OP1 | |||
| #undef OP2 | |||
| #undef OP3 | |||
| #undef OP4 | |||
| @@ -0,0 +1,583 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| #undef OP0 /* OP0..OP2 expand to "+=" or "-=" and set the signs of the complex products in the CGEMV_T_* and CSCALE_STORE_* macros below */ | |||
| #undef OP1 | |||
| #undef OP2 | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) /* CONJ and XCONJ both absent or both present */ | |||
| #define OP0 -= /* re += a_r * x_r - a_i * x_i */ | |||
| #define OP1 += /* im += a_r * x_i ... */ | |||
| #define OP2 += /* ... + a_i * x_r, i.e. the plain product a * x */ | |||
| #else | |||
| #define OP0 += /* re += a_r * x_r + a_i * x_i */ | |||
| #define OP1 += /* im += a_r * x_i ... */ | |||
| #define OP2 -= /* ... - a_i * x_r, i.e. conj(a) * x */ | |||
| #endif | |||
| #define CGEMV_T_8x4() /* Multiplies eight complex values from each of pa0..pa3 by the eight x values in x0r/x1r, x0i/x1i and adds the lane-wise products to the accumulators tp0..tp3 (r and i), one accumulator pair per stream. */ \ | |||
| LD_SP4(pa0 + k, 4, t0, t1, t2, t3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ | |||
| LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ | |||
| LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably splits interleaved data into real (src*r) and imaginary (src*i) vectors - confirm in macros_msa.h */ \ | |||
| PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ | |||
| PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ | |||
| PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ | |||
| PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ | |||
| PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulator of stream 0: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| tp0r += src1r * x1r; \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| tp0r OP0 src1i * x1i; \ | |||
| \ | |||
| tp1r += src2r * x0r; /* stream 1 */ \ | |||
| tp1r += src3r * x1r; \ | |||
| tp1r OP0 src2i * x0i; \ | |||
| tp1r OP0 src3i * x1i; \ | |||
| \ | |||
| tp2r += src4r * x0r; /* stream 2 */ \ | |||
| tp2r += src5r * x1r; \ | |||
| tp2r OP0 src4i * x0i; \ | |||
| tp2r OP0 src5i * x1i; \ | |||
| \ | |||
| tp3r += src6r * x0r; /* stream 3 */ \ | |||
| tp3r += src7r * x1r; \ | |||
| tp3r OP0 src6i * x0i; \ | |||
| tp3r OP0 src7i * x1i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulator of stream 0: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP1 src1r * x1i; \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| tp0i OP2 src1i * x1r; \ | |||
| \ | |||
| tp1i OP1 src2r * x0i; /* stream 1 */ \ | |||
| tp1i OP1 src3r * x1i; \ | |||
| tp1i OP2 src2i * x0r; \ | |||
| tp1i OP2 src3i * x1r; \ | |||
| \ | |||
| tp2i OP1 src4r * x0i; /* stream 2 */ \ | |||
| tp2i OP1 src5r * x1i; \ | |||
| tp2i OP2 src4i * x0r; \ | |||
| tp2i OP2 src5i * x1r; \ | |||
| \ | |||
| tp3i OP1 src6r * x0i; /* stream 3 */ \ | |||
| tp3i OP1 src7r * x1i; \ | |||
| tp3i OP2 src6i * x0r; \ | |||
| tp3i OP2 src7i * x1r; \ | |||
| #define CGEMV_T_8x2() /* Two-stream variant of CGEMV_T_8x4: eight complex values from pa0 and pa1 times x0r/x1r, x0i/x1i, accumulated into tp0 and tp1 (r and i). */ \ | |||
| LD_SP4(pa0 + k, 4, t0, t1, t2, t3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulator of stream 0: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| tp0r += src1r * x1r; \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| tp0r OP0 src1i * x1i; \ | |||
| \ | |||
| tp1r += src2r * x0r; /* stream 1 */ \ | |||
| tp1r += src3r * x1r; \ | |||
| tp1r OP0 src2i * x0i; \ | |||
| tp1r OP0 src3i * x1i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulator of stream 0: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP1 src1r * x1i; \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| tp0i OP2 src1i * x1r; \ | |||
| \ | |||
| tp1i OP1 src2r * x0i; /* stream 1 */ \ | |||
| tp1i OP1 src3r * x1i; \ | |||
| tp1i OP2 src2i * x0r; \ | |||
| tp1i OP2 src3i * x1r; \ | |||
| #define CGEMV_T_8x1() /* Single-stream variant: eight complex values from pa0 times x0r/x1r, x0i/x1i, accumulated into tp0r/tp0i. */ \ | |||
| LD_SP4(pa0 + k, 4, t0, t1, t2, t3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulator: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| tp0r += src1r * x1r; \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| tp0r OP0 src1i * x1i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulator: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP1 src1r * x1i; \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| tp0i OP2 src1i * x1r; \ | |||
| #define CGEMV_T_4x4() /* Four complex values from each of pa0..pa3 times the four x values in x0r/x0i, accumulated lane-wise into tp0..tp3 (r and i). */ \ | |||
| LD_SP2(pa0 + k, 4, t0, t1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP2(pa1 + k, 4, t4, t5); \ | |||
| LD_SP2(pa2 + k, 4, t8, t9); \ | |||
| LD_SP2(pa3 + k, 4, t12, t13); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ | |||
| PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulators: += a_r * x_r, then OP0 a_i * x_i, one pair of lines per stream */ \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| \ | |||
| tp1r += src2r * x0r; \ | |||
| tp1r OP0 src2i * x0i; \ | |||
| \ | |||
| tp2r += src4r * x0r; \ | |||
| tp2r OP0 src4i * x0i; \ | |||
| \ | |||
| tp3r += src6r * x0r; \ | |||
| tp3r OP0 src6i * x0i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulators: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| \ | |||
| tp1i OP1 src2r * x0i; \ | |||
| tp1i OP2 src2i * x0r; \ | |||
| \ | |||
| tp2i OP1 src4r * x0i; \ | |||
| tp2i OP2 src4i * x0r; \ | |||
| \ | |||
| tp3i OP1 src6r * x0i; \ | |||
| tp3i OP2 src6i * x0r; \ | |||
| #define CGEMV_T_4x2() /* Four complex values from pa0 and pa1 times the four x values in x0r/x0i, accumulated lane-wise into tp0 and tp1 (r and i). */ \ | |||
| LD_SP2(pa0 + k, 4, t0, t1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| LD_SP2(pa1 + k, 4, t4, t5); \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulators: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| \ | |||
| tp1r += src2r * x0r; \ | |||
| tp1r OP0 src2i * x0i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulators: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| \ | |||
| tp1i OP1 src2r * x0i; \ | |||
| tp1i OP2 src2i * x0r; \ | |||
| #define CGEMV_T_4x1() /* Four complex values from pa0 times the four x values in x0r/x0i, accumulated lane-wise into tp0r/tp0i. */ \ | |||
| LD_SP2(pa0 + k, 4, t0, t1); /* LD_SP2 is defined in macros_msa.h; presumably loads two vectors spaced 4 floats apart - confirm */ \ | |||
| \ | |||
| PCKEVOD_W2_SP(t1, t0, src0r, src0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| \ | |||
| tp0r += src0r * x0r; /* real accumulator: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| tp0r OP0 src0i * x0i; \ | |||
| \ | |||
| tp0i OP1 src0r * x0i; /* imaginary accumulator: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| tp0i OP2 src0i * x0r; \ | |||
| #define CGEMV_T_1x4() /* Scalar tail: multiplies the single x value at x[0], x[1] by one element of each of pa0..pa3 (offset k) and adds the products to the scalar sums temp0..temp3 (r and i). */ \ | |||
| temp0r += pa0[k + 0] * x[0 * inc_x2]; /* real sums: += a_r * x_r, then OP0 a_i * x_i, one pair of lines per stream */ \ | |||
| temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| temp1r += pa1[k + 0] * x[0 * inc_x2]; \ | |||
| temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| temp2r += pa2[k + 0] * x[0 * inc_x2]; \ | |||
| temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| temp3r += pa3[k + 0] * x[0 * inc_x2]; \ | |||
| temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| \ | |||
| temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; /* imaginary sums: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ | |||
| temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ | |||
| temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ | |||
| temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \ | |||
| temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \ | |||
| temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \ | |||
| temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \ | |||
| #define CGEMV_T_1x2() /* Scalar tail for two streams: multiplies the x value at x[0], x[1] by one element of pa0 and of pa1 and adds the products to temp0 and temp1 (r and i). */ \ | |||
| temp0r += pa0[k + 0] * x[0 * inc_x2]; /* real sums: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| temp1r += pa1[k + 0] * x[0 * inc_x2]; \ | |||
| temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| \ | |||
| temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; /* imaginary sums: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ | |||
| temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ | |||
| temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ | |||
| #define CGEMV_T_1x1() /* Scalar tail for one stream: multiplies the x value at x[0], x[1] by pa0[k], pa0[k + 1] and adds the product to temp0r/temp0i. */ \ | |||
| temp0r += pa0[k + 0] * x[0 * inc_x2]; /* real sum: += a_r * x_r, then OP0 a_i * x_i */ \ | |||
| temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ | |||
| \ | |||
| temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; /* imaginary sum: OP1 a_r * x_i, then OP2 a_i * x_r */ \ | |||
| temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ | |||
| #define CSCALE_STORE_Y4_GP() /* Reads four complex y values (spaced inc_y2 floats apart), adds the sums temp0..temp3 combined with alphar/alphai, and writes them back. NOTE(review): the alpha step reuses OP0/OP1/OP2; when they are (+=, +=, -=) the update equals conj(alpha) * temp, assuming alphar/alphai are the real/imaginary parts of alpha - confirm this is intended for the CONJ/XCONJ variants. */ \ | |||
| res0r = y[0 * inc_y2]; /* current real parts */ \ | |||
| res1r = y[1 * inc_y2]; \ | |||
| res2r = y[2 * inc_y2]; \ | |||
| res3r = y[3 * inc_y2]; \ | |||
| \ | |||
| res0i = y[0 * inc_y2 + 1]; /* current imaginary parts */ \ | |||
| res1i = y[1 * inc_y2 + 1]; \ | |||
| res2i = y[2 * inc_y2 + 1]; \ | |||
| res3i = y[3 * inc_y2 + 1]; \ | |||
| \ | |||
| res0r += alphar * temp0r; /* real: += alpha_r * t_r, then OP0 alpha_i * t_i */ \ | |||
| res0r OP0 alphai * temp0i; \ | |||
| res1r += alphar * temp1r; \ | |||
| res1r OP0 alphai * temp1i; \ | |||
| res2r += alphar * temp2r; \ | |||
| res2r OP0 alphai * temp2i; \ | |||
| res3r += alphar * temp3r; \ | |||
| res3r OP0 alphai * temp3i; \ | |||
| \ | |||
| res0i OP1 alphar * temp0i; /* imaginary: OP1 alpha_r * t_i, then OP2 alpha_i * t_r */ \ | |||
| res0i OP2 alphai * temp0r; \ | |||
| res1i OP1 alphar * temp1i; \ | |||
| res1i OP2 alphai * temp1r; \ | |||
| res2i OP1 alphar * temp2i; \ | |||
| res2i OP2 alphai * temp2r; \ | |||
| res3i OP1 alphar * temp3i; \ | |||
| res3i OP2 alphai * temp3r; \ | |||
| \ | |||
| y[0 * inc_y2] = res0r; /* write the updated values back */ \ | |||
| y[1 * inc_y2] = res1r; \ | |||
| y[2 * inc_y2] = res2r; \ | |||
| y[3 * inc_y2] = res3r; \ | |||
| \ | |||
| y[0 * inc_y2 + 1] = res0i; \ | |||
| y[1 * inc_y2 + 1] = res1i; \ | |||
| y[2 * inc_y2 + 1] = res2i; \ | |||
| y[3 * inc_y2 + 1] = res3i; \ | |||
| #define CSCALE_STORE_Y2_GP() /* Two-value form of CSCALE_STORE_Y4_GP: adds temp0 and temp1 combined with alphar/alphai to two complex y values. NOTE(review): the alpha step reuses OP0/OP1/OP2; with (+=, +=, -=) it equals conj(alpha) * temp, assuming alphar/alphai are the parts of alpha - confirm for the CONJ/XCONJ variants. */ \ | |||
| res0r = y[0 * inc_y2]; /* current real parts */ \ | |||
| res1r = y[1 * inc_y2]; \ | |||
| \ | |||
| res0i = y[0 * inc_y2 + 1]; /* current imaginary parts */ \ | |||
| res1i = y[1 * inc_y2 + 1]; \ | |||
| \ | |||
| res0r += alphar * temp0r; /* real: += alpha_r * t_r, then OP0 alpha_i * t_i */ \ | |||
| res0r OP0 alphai * temp0i; \ | |||
| res1r += alphar * temp1r; \ | |||
| res1r OP0 alphai * temp1i; \ | |||
| \ | |||
| res0i OP1 alphar * temp0i; /* imaginary: OP1 alpha_r * t_i, then OP2 alpha_i * t_r */ \ | |||
| res0i OP2 alphai * temp0r; \ | |||
| res1i OP1 alphar * temp1i; \ | |||
| res1i OP2 alphai * temp1r; \ | |||
| \ | |||
| y[0 * inc_y2] = res0r; /* write the updated values back */ \ | |||
| y[1 * inc_y2] = res1r; \ | |||
| \ | |||
| y[0 * inc_y2 + 1] = res0i; \ | |||
| y[1 * inc_y2 + 1] = res1i; \ | |||
| #define CSCALE_STORE_Y1_GP() /* One-value form of CSCALE_STORE_Y4_GP: adds temp0 combined with alphar/alphai to the complex value at y[0], y[1]. NOTE(review): the alpha step reuses OP0/OP1/OP2; with (+=, +=, -=) it equals conj(alpha) * temp, assuming alphar/alphai are the parts of alpha - confirm for the CONJ/XCONJ variants. */ \ | |||
| res0r = y[0 * inc_y2]; /* current real part */ \ | |||
| res0i = y[0 * inc_y2 + 1]; /* current imaginary part */ \ | |||
| \ | |||
| res0r += alphar * temp0r; /* real: += alpha_r * t_r, then OP0 alpha_i * t_i */ \ | |||
| res0r OP0 alphai * temp0i; \ | |||
| \ | |||
| res0i OP1 alphar * temp0i; /* imaginary: OP1 alpha_r * t_i, then OP2 alpha_i * t_r */ \ | |||
| res0i OP2 alphai * temp0r; \ | |||
| \ | |||
| y[0 * inc_y2] = res0r; /* write the updated value back */ \ | |||
| y[0 * inc_y2 + 1] = res0i; \ | |||
| #define CLOAD_X8_VECTOR() /* Vector load of x into x0r/x0i and x1r/x1i. */ \ | |||
| LD_SP4(x, 4, x0, x1, x2, x3); /* LD_SP4 is defined in macros_msa.h; presumably loads four vectors spaced 4 floats apart - confirm */ \ | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); /* presumably separates real and imaginary lanes - confirm */ \ | |||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); \ | |||
| #define CLOAD_X4_VECTOR() /* Vector load of x into x0r/x0i only. */ \ | |||
| LD_SP2(x, 4, x0, x1); \ | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ | |||
| #define CLOAD_X8_GP() /* Strided load of eight complex x values (spaced inc_x2 floats apart) into x0r/x1r (real) and x0i/x1i (imaginary), one lane at a time. */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); /* tp0r is only read as the starting vector; lanes 0..3 of the result are all overwritten. Floats are read through an int pointer to pass their bit pattern. */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ | |||
| x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); /* real parts of elements 4..7 */ \ | |||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \ | |||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \ | |||
| x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); /* imaginary parts of elements 0..3 (offset + 1) */ \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ | |||
| x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); /* imaginary parts of elements 4..7 */ \ | |||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \ | |||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \ | |||
| x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \ | |||
| #define CLOAD_X4_GP() /* Strided load of four complex x values (spaced inc_x2 floats apart) into x0r (real) and x0i (imaginary), one lane at a time. */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); /* tp0r is only read as the starting vector; lanes 0..3 of the result are all overwritten */ \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ | |||
| x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); /* imaginary parts (offset + 1) */ \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ | |||
| x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ | |||
/* Main body of the kernel, expanded once per x-loading flavour: CLOAD_X8 and
 * CLOAD_X4 must be #defined at the expansion site.
 *
 * The n dimension is consumed four outputs at a time, then an optional pair,
 * then an optional single output.  Within each group the m dimension is
 * walked in chunks of eight, an optional chunk of four, and finally up to
 * three leftover entries handled by the scalar CGEMV_T_1xN() macros.  The
 * accumulated values are passed on to CSCALE_STORE_Y*_GP().
 *
 * For every group k restarts at 0 and x at srcx_org; they then advance in
 * lock-step (k by two FLOATs, x by inc_x2 per processed entry).
 * Leaves i and j at -1 when the respective countdown loops finish. */
#define CGEMV_T_MSA()                                                 \
    /* ---- groups of four outputs ---- */                            \
    j = (n >> 2);                                                     \
    while (j--)                                                       \
    {                                                                 \
        tp0r = tp1r = tp2r = tp3r = zero;                             \
        tp0i = tp1i = tp2i = tp3i = zero;                             \
                                                                      \
        x = srcx_org;                                                 \
        k = 0;                                                        \
                                                                      \
        i = (m >> 3);                                                 \
        while (i--)                                                   \
        {                                                             \
            CLOAD_X8();                                               \
            CGEMV_T_8x4();                                            \
                                                                      \
            x += inc_x2 * 8;                                          \
            k += 2 * 8;                                               \
        }                                                             \
                                                                      \
        if (m & 4)                                                    \
        {                                                             \
            CLOAD_X4();                                               \
            CGEMV_T_4x4();                                            \
                                                                      \
            x += inc_x2 * 4;                                          \
            k += 2 * 4;                                               \
        }                                                             \
                                                                      \
        /* Reduce each accumulator to one value: after the transposes \
           (assuming a plain 4x4 transpose) lane c of the sum is the  \
           horizontal total of accumulator c. */                      \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r,                    \
                           tp0r, tp1r, tp2r, tp3r);                   \
        TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i,                    \
                           tp0i, tp1i, tp2i, tp3i);                   \
                                                                      \
        tp0r = tp0r + tp1r + tp2r + tp3r;                             \
        tp0i = tp0i + tp1i + tp2i + tp3i;                             \
                                                                      \
        temp0r = tp0r[0];                                             \
        temp0i = tp0i[0];                                             \
        temp1r = tp0r[1];                                             \
        temp1i = tp0i[1];                                             \
        temp2r = tp0r[2];                                             \
        temp2i = tp0i[2];                                             \
        temp3r = tp0r[3];                                             \
        temp3i = tp0i[3];                                             \
                                                                      \
        /* up to three leftover entries, one at a time */             \
        i = (m & 3);                                                  \
        while (i--)                                                   \
        {                                                             \
            CGEMV_T_1x4();                                            \
                                                                      \
            x += inc_x2;                                              \
            k += 2;                                                   \
        }                                                             \
                                                                      \
        CSCALE_STORE_Y4_GP();                                         \
                                                                      \
        pa0 += 4 * lda2;                                              \
        pa1 += 4 * lda2;                                              \
        pa2 += 4 * lda2;                                              \
        pa3 += 4 * lda2;                                              \
        y += 4 * inc_y2;                                              \
    }                                                                 \
                                                                      \
    /* ---- optional pair of outputs ---- */                          \
    if (n & 2)                                                        \
    {                                                                 \
        tp0r = tp1r = zero;                                           \
        tp0i = tp1i = zero;                                           \
                                                                      \
        x = srcx_org;                                                 \
        k = 0;                                                        \
                                                                      \
        i = (m >> 3);                                                 \
        while (i--)                                                   \
        {                                                             \
            CLOAD_X8();                                               \
            CGEMV_T_8x2();                                            \
                                                                      \
            x += inc_x2 * 8;                                          \
            k += 2 * 8;                                               \
        }                                                             \
                                                                      \
        if (m & 4)                                                    \
        {                                                             \
            CLOAD_X4();                                               \
            CGEMV_T_4x2();                                            \
                                                                      \
            x += inc_x2 * 4;                                          \
            k += 2 * 4;                                               \
        }                                                             \
                                                                      \
        /* One transpose handles all four accumulators here; the      \
           summed vector holds r0, r1, i0, i1 in lanes 0..3. */       \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i,                    \
                           tp0r, tp1r, tp0i, tp1i);                   \
                                                                      \
        tp0r = tp0r + tp1r + tp0i + tp1i;                             \
                                                                      \
        temp0r = tp0r[0];                                             \
        temp1r = tp0r[1];                                             \
        temp0i = tp0r[2];                                             \
        temp1i = tp0r[3];                                             \
                                                                      \
        i = (m & 3);                                                  \
        while (i--)                                                   \
        {                                                             \
            CGEMV_T_1x2();                                            \
                                                                      \
            x += inc_x2;                                              \
            k += 2;                                                   \
        }                                                             \
                                                                      \
        CSCALE_STORE_Y2_GP();                                         \
                                                                      \
        pa0 += 2 * lda2;                                              \
        pa1 += 2 * lda2;                                              \
        y += 2 * inc_y2;                                              \
    }                                                                 \
                                                                      \
    /* ---- optional single output ---- */                            \
    if (n & 1)                                                        \
    {                                                                 \
        tp0r = zero;                                                  \
        tp0i = zero;                                                  \
                                                                      \
        x = srcx_org;                                                 \
        k = 0;                                                        \
                                                                      \
        i = (m >> 3);                                                 \
        while (i--)                                                   \
        {                                                             \
            CLOAD_X8();                                               \
            CGEMV_T_8x1();                                            \
                                                                      \
            x += inc_x2 * 8;                                          \
            k += 2 * 8;                                               \
        }                                                             \
                                                                      \
        if (m & 4)                                                    \
        {                                                             \
            CLOAD_X4();                                               \
            CGEMV_T_4x1();                                            \
                                                                      \
            x += inc_x2 * 4;                                          \
            k += 2 * 4;                                               \
        }                                                             \
                                                                      \
        /* Combine tp0r / tp0i through t0 and t1; the even lanes of   \
           the folded vector feed temp0r and the odd lanes temp0i. */ \
        ILVRL_W2_SP(tp0i, tp0r, t0, t1);                              \
                                                                      \
        t0 += t1;                                                     \
                                                                      \
        temp0r = t0[0] + t0[2];                                       \
        temp0i = t0[1] + t0[3];                                       \
                                                                      \
        i = (m & 3);                                                  \
        while (i--)                                                   \
        {                                                             \
            CGEMV_T_1x1();                                            \
                                                                      \
            x += inc_x2;                                              \
            k += 2;                                                   \
        }                                                             \
                                                                      \
        CSCALE_STORE_Y1_GP();                                         \
                                                                      \
        pa0 += lda2;                                                  \
        y += inc_y2;                                                  \
    }
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, | |||
| FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3; | |||
| FLOAT *srcx_org = x; | |||
| FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; | |||
| FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; | |||
| BLASLONG inc_x2, inc_y2, lda2; | |||
| v4f32 zero = {0}; | |||
| v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i; | |||
| v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||
| v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; | |||
| v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; | |||
| v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; | |||
| lda2 = 2 * lda; | |||
| pa0 = A; | |||
| pa1 = A + lda2; | |||
| pa2 = A + 2 * lda2; | |||
| pa3 = A + 3 * lda2; | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| if (2 == inc_x2) | |||
| { | |||
| #define CLOAD_X8 CLOAD_X8_VECTOR | |||
| #define CLOAD_X4 CLOAD_X4_VECTOR | |||
| CGEMV_T_MSA(); | |||
| #undef CLOAD_X8 | |||
| #undef CLOAD_X4 | |||
| } | |||
| else | |||
| { | |||
| #define CLOAD_X8 CLOAD_X8_GP | |||
| #define CLOAD_X4 CLOAD_X4_GP | |||
| CGEMV_T_MSA(); | |||
| #undef CLOAD_X8 | |||
| #undef CLOAD_X4 | |||
| } | |||
| return(0); | |||
| } | |||
| #undef OP0 | |||
| #undef OP1 | |||
| #undef OP2 | |||
| @@ -0,0 +1,50 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n < 0 ) return(0); | |||
| while(i < n) | |||
| { | |||
| y[iy] = x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,278 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include "macros_msa.h" | |||
| #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT sumf = 0.0; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v2f64 zero_v = {0}; | |||
| v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| if (n > 15) | |||
| { | |||
| n -= 16; | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_D(src0); | |||
| sum_abs1 = AND_VEC_D(src1); | |||
| sum_abs2 = AND_VEC_D(src2); | |||
| sum_abs3 = AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| { | |||
| LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP3_INC(x, 2, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_DP2_INC(x, 2, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| src0 = LD_DP(x); x += 2; | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| if (n & 1) | |||
| { | |||
| sumf += fabs(*x); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (n > 8) | |||
| { | |||
| n -= 8; | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_D(src0); | |||
| sum_abs1 = AND_VEC_D(src1); | |||
| sum_abs2 = AND_VEC_D(src2); | |||
| sum_abs3 = AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| if (n & 7) | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) | |||
| { | |||
| LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| } | |||
| else if ((n & 4) && (n & 1)) | |||
| { | |||
| LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| } | |||
| else if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_DP3_INC(x, inc_x, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| src0 = LD_DP(x); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| } | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| } | |||
| return (sumf); | |||
| } | |||
| @@ -0,0 +1,189 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* return float, x,y float */ | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i = 0; | |||
| double dot = 0.0; | |||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||
| v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v2f64 dot0 = {0, 0}; | |||
| if (n < 0) return (dot); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| dot0 += (vy7 * vx7); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| { | |||
| LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); | |||
| LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); | |||
| LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); | |||
| LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP3_INC(x, 2, vx0, vx1, vx2); | |||
| LD_DP3_INC(y, 2, vy0, vy1, vy2); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_DP2_INC(x, 2, vx0, vx1); | |||
| LD_DP2_INC(y, 2, vy0, vy1); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| vx0 = LD_DP(x); x += 2; | |||
| vy0 = LD_DP(y); y += 2; | |||
| dot0 += (vy0 * vx0); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| y0 = *y; | |||
| dot += (y0 * x0); | |||
| } | |||
| } | |||
| dot += dot0[0]; | |||
| dot += dot0[1]; | |||
| } | |||
| else | |||
| { | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(y, inc_y, y0, y1, y2, y3); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| dot += (y3 * x3); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(x, inc_x, x0, x1, x2); | |||
| LD_GP3_INC(y, inc_y, y0, y1, y2); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(y, inc_y, y0, y1); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| y0 = *y; | |||
| dot += (y0 * x0); | |||
| } | |||
| } | |||
| return (dot); | |||
| } | |||
| @@ -0,0 +1,118 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
| for (j = (n >> 2); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| *pdst++ = *psrc2++; | |||
| *pdst++ = *psrc3++; | |||
| *pdst++ = *psrc4++; | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src3, src1, dst1, dst5); | |||
| ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| *pdst++ = *psrc2++; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2(psrc1, 2, src0, src1); | |||
| psrc1 += 4; | |||
| ST_DP2(src0, src1, pdst, 2); | |||
| pdst += 4; | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,186 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
| FLOAT * __restrict dst) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; | |||
| FLOAT *psrc8, *pdst; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | |||
| psrc0 = src; | |||
| pdst = dst; | |||
| for (j = (n >> 3); j--;) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc5 = psrc4 + lda; | |||
| psrc6 = psrc5 + lda; | |||
| psrc7 = psrc6 + lda; | |||
| psrc8 = psrc7 + lda; | |||
| psrc0 += 8 * lda; | |||
| for (i = (m >> 3); i--;) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| LD_DP2_INC(psrc5, 2, src8, src9); | |||
| LD_DP2_INC(psrc6, 2, src10, src11); | |||
| LD_DP2_INC(psrc7, 2, src12, src13); | |||
| LD_DP2_INC(psrc8, 2, src14, src15); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src10, src8, dst2, dst6); | |||
| ILVRL_D2_DP(src14, src12, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| ILVRL_D2_DP(src3, src1, dst0, dst4); | |||
| ILVRL_D2_DP(src7, src5, dst1, dst5); | |||
| ILVRL_D2_DP(src11, src9, dst2, dst6); | |||
| ILVRL_D2_DP(src15, src13, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 7); i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| *pdst++ = *psrc2++; | |||
| *pdst++ = *psrc3++; | |||
| *pdst++ = *psrc4++; | |||
| *pdst++ = *psrc5++; | |||
| *pdst++ = *psrc6++; | |||
| *pdst++ = *psrc7++; | |||
| *pdst++ = *psrc8++; | |||
| } | |||
| } | |||
| if (n & 4) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc3 = psrc2 + lda; | |||
| psrc4 = psrc3 + lda; | |||
| psrc0 += 4 * lda; | |||
| for (i = (m >> 2); i--;) | |||
| { | |||
| LD_DP2_INC(psrc1, 2, src0, src1); | |||
| LD_DP2_INC(psrc2, 2, src2, src3); | |||
| LD_DP2_INC(psrc3, 2, src4, src5); | |||
| LD_DP2_INC(psrc4, 2, src6, src7); | |||
| ILVRL_D2_DP(src2, src0, dst0, dst4); | |||
| ILVRL_D2_DP(src6, src4, dst1, dst5); | |||
| ILVRL_D2_DP(src3, src1, dst2, dst6); | |||
| ILVRL_D2_DP(src7, src5, dst3, dst7); | |||
| ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); | |||
| } | |||
| for (i = (m & 3); i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| *pdst++ = *psrc2++; | |||
| *pdst++ = *psrc3++; | |||
| *pdst++ = *psrc4++; | |||
| } | |||
| } | |||
| if (n & 2) | |||
| { | |||
| psrc1 = psrc0; | |||
| psrc2 = psrc1 + lda; | |||
| psrc0 += 2 * lda; | |||
| for (i = (m >> 1); i--;) | |||
| { | |||
| src0 = LD_DP(psrc1); | |||
| src1 = LD_DP(psrc2); | |||
| psrc1 += 2; | |||
| psrc2 += 2; | |||
| ILVRL_D2_DP(src1, src0, dst0, dst1); | |||
| ST_DP2_INC(dst0, dst1, pdst, 2); | |||
| } | |||
| if (m & 1) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| *pdst++ = *psrc2++; | |||
| } | |||
| } | |||
| if (n & 1) | |||
| { | |||
| psrc1 = psrc0; | |||
| for (i = m; i--;) | |||
| { | |||
| *pdst++ = *psrc1++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,153 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* Repacks m lines of n FLOATs each (consecutive lines are lda apart in src) into dst. */ | |||
| /* Lines are consumed four at a time, then two, then one. Along n, every full run of */ | |||
| /* four values lands in its own 4*m-sized panel of dst; a leftover pair (n & 2) goes to */ | |||
| /* the panel at dst + m*(n & ~3) and a leftover single (n & 1) to dst + m*(n & ~1). */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
|           FLOAT * __restrict dst) | |||
| { | |||
|     BLASLONG groups, runs; | |||
|     FLOAT *next_line = src;               /* first source line not yet consumed */ | |||
|     FLOAT *ln0, *ln1, *ln2, *ln3;         /* read cursors of the current lines */ | |||
|     FLOAT *next_tile = dst;               /* slot of the current line group in panel 0 */ | |||
|     FLOAT *tile;                          /* write cursor, hops from panel to panel */ | |||
|     FLOAT *tail2 = dst + m * (n & ~3);    /* panel receiving the leftover pair */ | |||
|     FLOAT *tail1 = dst + m * (n & ~1);    /* panel receiving the leftover single */ | |||
|     v2f64 v0, v1, v2, v3, v4, v5, v6, v7; | |||
|     /* ---- groups of four lines ---- */ | |||
|     groups = m >> 2; | |||
|     while (groups--) | |||
|     { | |||
|         ln0 = next_line; | |||
|         ln1 = ln0 + lda; | |||
|         ln2 = ln1 + lda; | |||
|         ln3 = ln2 + lda; | |||
|         next_line += 4 * lda; | |||
|         tile = next_tile; | |||
|         next_tile += 16;                  /* a 4x4 tile occupies 16 values */ | |||
|         runs = n >> 2; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             LD_DP2_INC(ln1, 2, v2, v3); | |||
|             LD_DP2_INC(ln2, 2, v4, v5); | |||
|             LD_DP2_INC(ln3, 2, v6, v7); | |||
|             ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2); | |||
|             tile += m * 4;                /* same slot in the next panel */ | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             v1 = LD_DP(ln1); | |||
|             ln1 += 2; | |||
|             v2 = LD_DP(ln2); | |||
|             ln2 += 2; | |||
|             v3 = LD_DP(ln3); | |||
|             ln3 += 2; | |||
|             ST_DP4_INC(v0, v1, v2, v3, tail2, 2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|             *tail1++ = *ln1++; | |||
|             *tail1++ = *ln2++; | |||
|             *tail1++ = *ln3++; | |||
|         } | |||
|     } | |||
|     /* ---- one remaining group of two lines ---- */ | |||
|     if (m & 2) | |||
|     { | |||
|         ln0 = next_line; | |||
|         ln1 = ln0 + lda; | |||
|         next_line += 2 * lda; | |||
|         tile = next_tile; | |||
|         next_tile += 8; | |||
|         runs = n >> 2; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             LD_DP2_INC(ln1, 2, v2, v3); | |||
|             ST_DP4(v0, v1, v2, v3, tile, 2); | |||
|             tile += m * 4; | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             v1 = LD_DP(ln1); | |||
|             ln1 += 2; | |||
|             ST_DP2_INC(v0, v1, tail2, 2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|             *tail1++ = *ln1++; | |||
|         } | |||
|     } | |||
|     /* ---- last single line; the cursors are not needed afterwards ---- */ | |||
|     if (m & 1) | |||
|     { | |||
|         ln0 = next_line; | |||
|         tile = next_tile; | |||
|         runs = n >> 2; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             ST_DP2(v0, v1, tile, 2); | |||
|             tile += 4 * m; | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             ST_DP(v0, tail2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1 = *ln0; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,276 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* Repacks m lines of n FLOATs each (consecutive lines are lda apart in src) into dst. */ | |||
| /* Lines are consumed eight at a time, then four, two and one. Along n, every full run */ | |||
| /* of eight values lands in its own 8*m-sized panel of dst; leftovers of four, two and */ | |||
| /* one value go to the panels at dst + m*(n & ~7), dst + m*(n & ~3) and */ | |||
| /* dst + m*(n & ~1) respectively. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, | |||
|           FLOAT * __restrict dst) | |||
| { | |||
|     BLASLONG groups, runs; | |||
|     FLOAT *next_line = src;               /* first source line not yet consumed */ | |||
|     FLOAT *ln0, *ln1, *ln2, *ln3;         /* read cursors of the current lines */ | |||
|     FLOAT *ln4, *ln5, *ln6, *ln7; | |||
|     FLOAT *next_tile = dst;               /* slot of the current line group in panel 0 */ | |||
|     FLOAT *tile;                          /* write cursor, hops from panel to panel */ | |||
|     FLOAT *tail4 = dst + m * (n & ~7);    /* panel receiving a leftover run of four */ | |||
|     FLOAT *tail2 = dst + m * (n & ~3);    /* panel receiving a leftover pair */ | |||
|     FLOAT *tail1 = dst + m * (n & ~1);    /* panel receiving a leftover single */ | |||
|     v2f64 v0, v1, v2, v3, v4, v5, v6, v7; | |||
|     v2f64 v8, v9, v10, v11, v12, v13, v14, v15; | |||
|     /* ---- groups of eight lines ---- */ | |||
|     groups = m >> 3; | |||
|     while (groups--) | |||
|     { | |||
|         ln0 = next_line; | |||
|         ln1 = ln0 + lda; | |||
|         ln2 = ln1 + lda; | |||
|         ln3 = ln2 + lda; | |||
|         ln4 = ln3 + lda; | |||
|         ln5 = ln4 + lda; | |||
|         ln6 = ln5 + lda; | |||
|         ln7 = ln6 + lda; | |||
|         next_line += 8 * lda; | |||
|         tile = next_tile; | |||
|         next_tile += 64;                  /* an 8x8 tile occupies 64 values */ | |||
|         runs = n >> 3; | |||
|         while (runs--) | |||
|         { | |||
|             /* first four lines -> tile[0..31] */ | |||
|             LD_DP4_INC(ln0, 2, v0, v1, v2, v3); | |||
|             LD_DP4_INC(ln1, 2, v4, v5, v6, v7); | |||
|             LD_DP4_INC(ln2, 2, v8, v9, v10, v11); | |||
|             LD_DP4_INC(ln3, 2, v12, v13, v14, v15); | |||
|             ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2); | |||
|             ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, | |||
|                    tile + 16, 2); | |||
|             /* last four lines -> tile[32..63], reusing the same registers */ | |||
|             LD_DP4_INC(ln4, 2, v0, v1, v2, v3); | |||
|             LD_DP4_INC(ln5, 2, v4, v5, v6, v7); | |||
|             LD_DP4_INC(ln6, 2, v8, v9, v10, v11); | |||
|             LD_DP4_INC(ln7, 2, v12, v13, v14, v15); | |||
|             ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile + 32, | |||
|                    2); | |||
|             ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, | |||
|                    tile + 48, 2); | |||
|             tile += m * 8;                /* same slot in the next panel */ | |||
|         } | |||
|         if (n & 4) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             LD_DP2_INC(ln1, 2, v2, v3); | |||
|             LD_DP2_INC(ln2, 2, v4, v5); | |||
|             LD_DP2_INC(ln3, 2, v6, v7); | |||
|             LD_DP2_INC(ln4, 2, v8, v9); | |||
|             LD_DP2_INC(ln5, 2, v10, v11); | |||
|             LD_DP2_INC(ln6, 2, v12, v13); | |||
|             LD_DP2_INC(ln7, 2, v14, v15); | |||
|             ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, tail4, 2); | |||
|             ST_DP8_INC(v8, v9, v10, v11, v12, v13, v14, v15, | |||
|                        tail4, 2); | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             v1 = LD_DP(ln1); | |||
|             ln1 += 2; | |||
|             v2 = LD_DP(ln2); | |||
|             ln2 += 2; | |||
|             v3 = LD_DP(ln3); | |||
|             ln3 += 2; | |||
|             v4 = LD_DP(ln4); | |||
|             ln4 += 2; | |||
|             v5 = LD_DP(ln5); | |||
|             ln5 += 2; | |||
|             v6 = LD_DP(ln6); | |||
|             ln6 += 2; | |||
|             v7 = LD_DP(ln7); | |||
|             ln7 += 2; | |||
|             ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, tail2, 2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|             *tail1++ = *ln1++; | |||
|             *tail1++ = *ln2++; | |||
|             *tail1++ = *ln3++; | |||
|             *tail1++ = *ln4++; | |||
|             *tail1++ = *ln5++; | |||
|             *tail1++ = *ln6++; | |||
|             *tail1++ = *ln7++; | |||
|         } | |||
|     } | |||
|     /* ---- one remaining group of four lines ---- */ | |||
|     if (m & 4) | |||
|     { | |||
|         ln0 = next_line; | |||
|         ln1 = ln0 + lda; | |||
|         ln2 = ln1 + lda; | |||
|         ln3 = ln2 + lda; | |||
|         next_line += 4 * lda; | |||
|         tile = next_tile; | |||
|         next_tile += 32; | |||
|         runs = n >> 3; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP4_INC(ln0, 2, v0, v1, v2, v3); | |||
|             LD_DP4_INC(ln1, 2, v4, v5, v6, v7); | |||
|             LD_DP4_INC(ln2, 2, v8, v9, v10, v11); | |||
|             LD_DP4_INC(ln3, 2, v12, v13, v14, v15); | |||
|             ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2); | |||
|             ST_DP8(v8, v9, v10, v11, v12, v13, v14, v15, | |||
|                    tile + 16, 2); | |||
|             tile += 8 * m; | |||
|         } | |||
|         if (n & 4) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             LD_DP2_INC(ln1, 2, v2, v3); | |||
|             LD_DP2_INC(ln2, 2, v4, v5); | |||
|             LD_DP2_INC(ln3, 2, v6, v7); | |||
|             ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, tail4, 2); | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             v1 = LD_DP(ln1); | |||
|             ln1 += 2; | |||
|             v2 = LD_DP(ln2); | |||
|             ln2 += 2; | |||
|             v3 = LD_DP(ln3); | |||
|             ln3 += 2; | |||
|             ST_DP4_INC(v0, v1, v2, v3, tail2, 2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|             *tail1++ = *ln1++; | |||
|             *tail1++ = *ln2++; | |||
|             *tail1++ = *ln3++; | |||
|         } | |||
|     } | |||
|     /* ---- one remaining group of two lines ---- */ | |||
|     if (m & 2) | |||
|     { | |||
|         ln0 = next_line; | |||
|         ln1 = ln0 + lda; | |||
|         next_line += 2 * lda; | |||
|         tile = next_tile; | |||
|         next_tile += 16; | |||
|         runs = n >> 3; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP4_INC(ln0, 2, v0, v1, v2, v3); | |||
|             LD_DP4_INC(ln1, 2, v4, v5, v6, v7); | |||
|             ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, tile, 2); | |||
|             tile += 8 * m; | |||
|         } | |||
|         if (n & 4) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             LD_DP2_INC(ln1, 2, v2, v3); | |||
|             ST_DP4_INC(v0, v1, v2, v3, tail4, 2); | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             v1 = LD_DP(ln1); | |||
|             ln1 += 2; | |||
|             ST_DP2_INC(v0, v1, tail2, 2); | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|             *tail1++ = *ln1++; | |||
|         } | |||
|     } | |||
|     /* ---- last single line ---- */ | |||
|     if (m & 1) | |||
|     { | |||
|         ln0 = next_line; | |||
|         next_line += lda; | |||
|         tile = next_tile; | |||
|         next_tile += 8; | |||
|         runs = n >> 3; | |||
|         while (runs--) | |||
|         { | |||
|             LD_DP4_INC(ln0, 2, v0, v1, v2, v3); | |||
|             ST_DP4(v0, v1, v2, v3, tile, 2); | |||
|             tile += 8 * m; | |||
|         } | |||
|         if (n & 4) | |||
|         { | |||
|             LD_DP2_INC(ln0, 2, v0, v1); | |||
|             ST_DP2_INC(v0, v1, tail4, 2); | |||
|         } | |||
|         if (n & 2) | |||
|         { | |||
|             v0 = LD_DP(ln0); | |||
|             ln0 += 2; | |||
|             ST_DP(v0, tail2); | |||
|             tail2 += 2; | |||
|         } | |||
|         if (n & 1) | |||
|         { | |||
|             *tail1++ = *ln0++; | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,577 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* Adds tp0..tp7 times the eight values at pa0 + k .. pa7 + k into y0..y3. */ | |||
| /* Expands in a scope that provides pa0..pa7, k, tp0..tp7, y0..y3 and t0..t31. */ | |||
| /* Each line is loaded right before use; every y register is still updated in */ | |||
| /* the order pa0, pa1, ..., pa7. */ | |||
| #define DGEMV_N_8x8() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     y2 += tp0 * t2; \ | |||
|     y3 += tp0 * t3; \ | |||
|     \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
|     y2 += tp1 * t6; \ | |||
|     y3 += tp1 * t7; \ | |||
|     \ | |||
|     LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ | |||
|     y0 += tp2 * t8; \ | |||
|     y1 += tp2 * t9; \ | |||
|     y2 += tp2 * t10; \ | |||
|     y3 += tp2 * t11; \ | |||
|     \ | |||
|     LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ | |||
|     y0 += tp3 * t12; \ | |||
|     y1 += tp3 * t13; \ | |||
|     y2 += tp3 * t14; \ | |||
|     y3 += tp3 * t15; \ | |||
|     \ | |||
|     LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ | |||
|     y0 += tp4 * t16; \ | |||
|     y1 += tp4 * t17; \ | |||
|     y2 += tp4 * t18; \ | |||
|     y3 += tp4 * t19; \ | |||
|     \ | |||
|     LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ | |||
|     y0 += tp5 * t20; \ | |||
|     y1 += tp5 * t21; \ | |||
|     y2 += tp5 * t22; \ | |||
|     y3 += tp5 * t23; \ | |||
|     \ | |||
|     LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ | |||
|     y0 += tp6 * t24; \ | |||
|     y1 += tp6 * t25; \ | |||
|     y2 += tp6 * t26; \ | |||
|     y3 += tp6 * t27; \ | |||
|     \ | |||
|     LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ | |||
|     y0 += tp7 * t28; \ | |||
|     y1 += tp7 * t29; \ | |||
|     y2 += tp7 * t30; \ | |||
|     y3 += tp7 * t31; \ | |||
| } | |||
| /* Adds tp0..tp7 times the four values at pa0 + k .. pa7 + k into y0..y1. */ | |||
| /* Expands in a scope that provides pa0..pa7, k, tp0..tp7, y0, y1 and the t* temporaries. */ | |||
| /* Each line is loaded right before use; y0/y1 are still updated in the order pa0..pa7. */ | |||
| #define DGEMV_N_4x8() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
|     \ | |||
|     LD_DP2(pa2 + k, 2, t8, t9); \ | |||
|     y0 += tp2 * t8; \ | |||
|     y1 += tp2 * t9; \ | |||
|     \ | |||
|     LD_DP2(pa3 + k, 2, t12, t13); \ | |||
|     y0 += tp3 * t12; \ | |||
|     y1 += tp3 * t13; \ | |||
|     \ | |||
|     LD_DP2(pa4 + k, 2, t16, t17); \ | |||
|     y0 += tp4 * t16; \ | |||
|     y1 += tp4 * t17; \ | |||
|     \ | |||
|     LD_DP2(pa5 + k, 2, t20, t21); \ | |||
|     y0 += tp5 * t20; \ | |||
|     y1 += tp5 * t21; \ | |||
|     \ | |||
|     LD_DP2(pa6 + k, 2, t24, t25); \ | |||
|     y0 += tp6 * t24; \ | |||
|     y1 += tp6 * t25; \ | |||
|     \ | |||
|     LD_DP2(pa7 + k, 2, t28, t29); \ | |||
|     y0 += tp7 * t28; \ | |||
|     y1 += tp7 * t29; \ | |||
| } | |||
| /* Adds tp0..tp3 times the eight values at pa0 + k .. pa3 + k into y0..y3. */ | |||
| /* Expands in a scope that provides pa0..pa3, k, tp0..tp3, y0..y3 and t0..t15. */ | |||
| /* Each line is loaded right before use; every y register is still updated in */ | |||
| /* the order pa0, pa1, pa2, pa3. */ | |||
| #define DGEMV_N_8x4() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     y2 += tp0 * t2; \ | |||
|     y3 += tp0 * t3; \ | |||
|     \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
|     y2 += tp1 * t6; \ | |||
|     y3 += tp1 * t7; \ | |||
|     \ | |||
|     LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ | |||
|     y0 += tp2 * t8; \ | |||
|     y1 += tp2 * t9; \ | |||
|     y2 += tp2 * t10; \ | |||
|     y3 += tp2 * t11; \ | |||
|     \ | |||
|     LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ | |||
|     y0 += tp3 * t12; \ | |||
|     y1 += tp3 * t13; \ | |||
|     y2 += tp3 * t14; \ | |||
|     y3 += tp3 * t15; \ | |||
| } | |||
| /* Adds tp0..tp3 times the four values at pa0 + k .. pa3 + k into y0..y1. */ | |||
| /* Expands in a scope that provides pa0..pa3, k, tp0..tp3, y0, y1 and the t* temporaries. */ | |||
| /* Each line is loaded right before use; y0/y1 are still updated in the order pa0..pa3. */ | |||
| #define DGEMV_N_4x4() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
|     \ | |||
|     LD_DP2(pa2 + k, 2, t8, t9); \ | |||
|     y0 += tp2 * t8; \ | |||
|     y1 += tp2 * t9; \ | |||
|     \ | |||
|     LD_DP2(pa3 + k, 2, t12, t13); \ | |||
|     y0 += tp3 * t12; \ | |||
|     y1 += tp3 * t13; \ | |||
| } | |||
| /* Adds tp0/tp1 times the eight values at pa0 + k and pa1 + k into y0..y3. */ | |||
| /* Expands in a scope that provides pa0, pa1, k, tp0, tp1, y0..y3 and t0..t7. */ | |||
| /* Each line is loaded right before use; every y register sees pa0 before pa1. */ | |||
| #define DGEMV_N_8x2() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     y2 += tp0 * t2; \ | |||
|     y3 += tp0 * t3; \ | |||
|     \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
|     y2 += tp1 * t6; \ | |||
|     y3 += tp1 * t7; \ | |||
| } | |||
| /* Adds tp0/tp1 times the four values at pa0 + k and pa1 + k into y0..y1. */ | |||
| /* Expands in a scope that provides pa0, pa1, k, tp0, tp1, y0, y1, t0, t1, t4 and t5. */ | |||
| /* Each line is loaded right before use; y0/y1 see pa0 before pa1. */ | |||
| #define DGEMV_N_4x2() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     y0 += tp0 * t0; \ | |||
|     y1 += tp0 * t1; \ | |||
|     \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     y0 += tp1 * t4; \ | |||
|     y1 += tp1 * t5; \ | |||
| } | |||
| /* Strided x load: temp0..temp7 = alpha * x[0..7 * inc_x], each then replicated */ | |||
| /* into tp0..tp7 with COPY_DOUBLE_TO_VECTOR. Expands in a scope that provides */ | |||
| /* alpha, x, inc_x, temp0..temp7 and tp0..tp7. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_X8_SCALE_GP() \ | |||
| { \ | |||
|     temp0 = alpha * x[0 * inc_x]; \ | |||
|     temp1 = alpha * x[1 * inc_x]; \ | |||
|     temp2 = alpha * x[2 * inc_x]; \ | |||
|     temp3 = alpha * x[3 * inc_x]; \ | |||
|     temp4 = alpha * x[4 * inc_x]; \ | |||
|     temp5 = alpha * x[5 * inc_x]; \ | |||
|     temp6 = alpha * x[6 * inc_x]; \ | |||
|     temp7 = alpha * x[7 * inc_x]; \ | |||
|     \ | |||
|     tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ | |||
|     tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ | |||
|     tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ | |||
|     tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ | |||
|     tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \ | |||
|     tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \ | |||
|     tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \ | |||
|     tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \ | |||
| } | |||
| /* Strided x load: temp0..temp3 = alpha * x[0..3 * inc_x], each then replicated */ | |||
| /* into tp0..tp3 with COPY_DOUBLE_TO_VECTOR. Expands in a scope that provides */ | |||
| /* alpha, x, inc_x, temp0..temp3 and tp0..tp3. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_X4_SCALE_GP() \ | |||
| { \ | |||
|     temp0 = alpha * x[0 * inc_x]; \ | |||
|     temp1 = alpha * x[1 * inc_x]; \ | |||
|     temp2 = alpha * x[2 * inc_x]; \ | |||
|     temp3 = alpha * x[3 * inc_x]; \ | |||
|     \ | |||
|     tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ | |||
|     tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ | |||
|     tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ | |||
|     tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ | |||
| } | |||
| /* Contiguous x load: reads eight values at x into x0..x3, multiplies them by */ | |||
| /* v_alpha and hands each register to SPLATI_D2_DP to fill tp0..tp7 pairwise. */ | |||
| /* Expands in a scope that provides x, v_alpha, x0..x3 and tp0..tp7. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_X8_SCALE_VECTOR() \ | |||
| { \ | |||
|     LD_DP4(x, 2, x0, x1, x2, x3); \ | |||
|     \ | |||
|     x0 = x0 * v_alpha; \ | |||
|     x1 = x1 * v_alpha; \ | |||
|     x2 = x2 * v_alpha; \ | |||
|     x3 = x3 * v_alpha; \ | |||
|     \ | |||
|     SPLATI_D2_DP(x0, tp0, tp1); \ | |||
|     SPLATI_D2_DP(x1, tp2, tp3); \ | |||
|     SPLATI_D2_DP(x2, tp4, tp5); \ | |||
|     SPLATI_D2_DP(x3, tp6, tp7); \ | |||
| } | |||
| /* Contiguous x load: reads four values at x into x0..x1, multiplies them by */ | |||
| /* v_alpha and hands each register to SPLATI_D2_DP to fill tp0..tp3 pairwise. */ | |||
| /* Expands in a scope that provides x, v_alpha, x0, x1 and tp0..tp3. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_X4_SCALE_VECTOR() \ | |||
| { \ | |||
|     LD_DP2(x, 2, x0, x1); \ | |||
|     \ | |||
|     x0 = x0 * v_alpha; \ | |||
|     x1 = x1 * v_alpha; \ | |||
|     \ | |||
|     SPLATI_D2_DP(x0, tp0, tp1); \ | |||
|     SPLATI_D2_DP(x1, tp2, tp3); \ | |||
| } | |||
| /* Strided y load: gathers y[0..7 * inc_y] into y0..y3, two values per register. */ | |||
| /* tp0 only serves as the starting vector for the first insert of each pair; both */ | |||
| /* of its lanes are overwritten, so its contents do not reach the result. */ | |||
| /* NOTE(review): values travel as 64-bit integers through a (long long *) cast of */ | |||
| /* a FLOAT pointer - presumably FLOAT is double here; confirm aliasing is acceptable. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_Y8_GP() \ | |||
| { \ | |||
|     y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ | |||
|     y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ | |||
|     y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ | |||
|     y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ | |||
|     y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \ | |||
|     y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \ | |||
|     y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \ | |||
|     y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \ | |||
| } | |||
| /* Strided y load: gathers y[0..3 * inc_y] into y0..y1, two values per register. */ | |||
| /* tp0 only serves as the starting vector for the first insert of each pair; both */ | |||
| /* of its lanes are overwritten, so its contents do not reach the result. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DLOAD_Y4_GP() \ | |||
| { \ | |||
|     y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ | |||
|     y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ | |||
|     y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ | |||
|     y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ | |||
| } | |||
| /* Contiguous y load of eight values into y0..y3. No trailing ';' in the body: */ | |||
| /* every use is written as DLOAD_Y8(); so the call site terminates the statement. */ | |||
| #define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3) | |||
| /* Contiguous y load of four values into y0..y1; same no-semicolon convention. */ | |||
| #define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1) | |||
| /* Strided y store: scatters the two lanes of each of y0..y3 to y[0..7 * inc_y]. */ | |||
| /* NOTE(review): lanes are written as 64-bit integers through a (long long *) cast */ | |||
| /* of a FLOAT pointer - presumably FLOAT is double here; confirm aliasing is acceptable. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DSTORE_Y8_GP() \ | |||
| { \ | |||
|     *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ | |||
|     *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ | |||
|     *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ | |||
|     *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ | |||
|     *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \ | |||
|     *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \ | |||
|     *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \ | |||
|     *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \ | |||
| } | |||
| /* Strided y store: scatters the two lanes of y0 and y1 to y[0..3 * inc_y]. */ | |||
| /* The body is braced like the DGEMV_N_* macros and its last line carries no */ | |||
| /* continuation backslash, so the definition cannot run into the next line. */ | |||
| #define DSTORE_Y4_GP() \ | |||
| { \ | |||
|     *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ | |||
|     *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ | |||
|     *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ | |||
|     *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ | |||
| } | |||
| /* Contiguous y store of y0..y3 (eight values) at y. No trailing ';' in the body: */ | |||
| /* every use is written as DSTORE_Y8(); so the call site terminates the statement. */ | |||
| #define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2) | |||
| /* Contiguous y store of y0..y1 (four values) at y; same no-semicolon convention. */ | |||
| #define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2) | |||
| #define DGEMV_N_MSA() \ | |||
| for (j = (n >> 3); j--;) \ | |||
| { \ | |||
| DLOAD_X8_SCALE(); \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_Y8(); \ | |||
| DGEMV_N_8x8(); \ | |||
| DSTORE_Y8(); \ | |||
| \ | |||
| y += 8 * inc_y; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_Y4(); \ | |||
| DGEMV_N_4x8(); \ | |||
| DSTORE_Y4(); \ | |||
| \ | |||
| y += 4 * inc_y; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 3) \ | |||
| { \ | |||
| temp0 = alpha * x[0 * inc_x]; \ | |||
| temp1 = alpha * x[1 * inc_x]; \ | |||
| temp2 = alpha * x[2 * inc_x]; \ | |||
| temp3 = alpha * x[3 * inc_x]; \ | |||
| temp4 = alpha * x[4 * inc_x]; \ | |||
| temp5 = alpha * x[5 * inc_x]; \ | |||
| temp6 = alpha * x[6 * inc_x]; \ | |||
| temp7 = alpha * x[7 * inc_x]; \ | |||
| \ | |||
| for (i = (m & 3); i--;) \ | |||
| { \ | |||
| temp = y[0]; \ | |||
| temp += temp0 * pa0[k]; \ | |||
| temp += temp1 * pa1[k]; \ | |||
| temp += temp2 * pa2[k]; \ | |||
| temp += temp3 * pa3[k]; \ | |||
| temp += temp4 * pa4[k]; \ | |||
| temp += temp5 * pa5[k]; \ | |||
| temp += temp6 * pa6[k]; \ | |||
| temp += temp7 * pa7[k]; \ | |||
| y[0] = temp; \ | |||
| \ | |||
| y += inc_y; \ | |||
| k++; \ | |||
| } \ | |||
| } \ | |||
| pa0 += 8 * lda; \ | |||
| pa1 += 8 * lda; \ | |||
| pa2 += 8 * lda; \ | |||
| pa3 += 8 * lda; \ | |||
| pa4 += 8 * lda; \ | |||
| pa5 += 8 * lda; \ | |||
| pa6 += 8 * lda; \ | |||
| pa7 += 8 * lda; \ | |||
| \ | |||
| x += 8 * inc_x; \ | |||
| } \ | |||
| \ | |||
| if (n & 4) \ | |||
| { \ | |||
| DLOAD_X4_SCALE(); \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_Y8(); \ | |||
| DGEMV_N_8x4(); \ | |||
| DSTORE_Y8(); \ | |||
| \ | |||
| y += 8 * inc_y; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_Y4(); \ | |||
| DGEMV_N_4x4(); \ | |||
| DSTORE_Y4(); \ | |||
| \ | |||
| y += 4 * inc_y; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 3) \ | |||
| { \ | |||
| temp0 = alpha * x[0 * inc_x]; \ | |||
| temp1 = alpha * x[1 * inc_x]; \ | |||
| temp2 = alpha * x[2 * inc_x]; \ | |||
| temp3 = alpha * x[3 * inc_x]; \ | |||
| \ | |||
| for (i = (m & 3); i--;) \ | |||
| { \ | |||
| temp = y[0]; \ | |||
| temp += temp0 * pa0[k]; \ | |||
| temp += temp1 * pa1[k]; \ | |||
| temp += temp2 * pa2[k]; \ | |||
| temp += temp3 * pa3[k]; \ | |||
| y[0] = temp; \ | |||
| \ | |||
| y += inc_y; \ | |||
| k++; \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| pa0 += 4 * lda; \ | |||
| pa1 += 4 * lda; \ | |||
| pa2 += 4 * lda; \ | |||
| pa3 += 4 * lda; \ | |||
| \ | |||
| x += 4 * inc_x; \ | |||
| } \ | |||
| \ | |||
| if (n & 2) \ | |||
| { \ | |||
| temp0 = alpha * x[0 * inc_x]; \ | |||
| temp1 = alpha * x[1 * inc_x]; \ | |||
| \ | |||
| tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ | |||
| tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_Y8(); \ | |||
| DGEMV_N_8x2(); \ | |||
| DSTORE_Y8(); \ | |||
| \ | |||
| y += 8 * inc_y; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_Y4(); \ | |||
| DGEMV_N_4x2(); \ | |||
| DSTORE_Y4(); \ | |||
| \ | |||
| y += 4 * inc_y; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 3) \ | |||
| { \ | |||
| temp0 = alpha * x[0 * inc_x]; \ | |||
| temp1 = alpha * x[1 * inc_x]; \ | |||
| \ | |||
| for (i = (m & 3); i--;) \ | |||
| { \ | |||
| temp = y[0]; \ | |||
| temp += temp0 * pa0[k]; \ | |||
| temp += temp1 * pa1[k]; \ | |||
| y[0] = temp; \ | |||
| \ | |||
| y += inc_y; \ | |||
| k++; \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| pa0 += 2 * lda; \ | |||
| pa1 += 2 * lda; \ | |||
| \ | |||
| x += 2 * inc_x; \ | |||
| } \ | |||
| \ | |||
| if (n & 1) \ | |||
| { \ | |||
| temp = alpha * x[0]; \ | |||
| \ | |||
| k = 0; \ | |||
| y = y_org; \ | |||
| \ | |||
| for (i = m; i--;) \ | |||
| { \ | |||
| y[0] += temp * pa0[k]; \ | |||
| y += inc_y; \ | |||
| k++; \ | |||
| } \ | |||
| } \ | |||
| /* Entry point: expands DGEMV_N_MSA once per access pattern so that, for every */ | |||
| /* i < m and j < n, y[i * inc_y] += alpha * x[j * inc_x] * A[j * lda + i]. */ | |||
| /* Unit-stride x and/or y use the *_VECTOR helpers, anything else the *_GP helpers. */ | |||
| /* dummy1 and buffer are not referenced. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||
|           BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
|           FLOAT *buffer) | |||
| { | |||
|     /* every local below is referenced by name from the macros above */ | |||
|     BLASLONG i, j, k; | |||
|     FLOAT *y_org = y; | |||
|     FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||
|     FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||
|     v2f64 v_alpha; | |||
|     v2f64 x0, x1, x2, x3, y0, y1, y2, y3; | |||
|     v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||
|     v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; | |||
|     v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; | |||
|     v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); | |||
|     /* eight cursors, one per consecutive line of A (lda apart) */ | |||
|     pa0 = A; | |||
|     pa1 = pa0 + lda; | |||
|     pa2 = pa1 + lda; | |||
|     pa3 = pa2 + lda; | |||
|     pa4 = pa3 + lda; | |||
|     pa5 = pa4 + lda; | |||
|     pa6 = pa5 + lda; | |||
|     pa7 = pa6 + lda; | |||
|     if (1 == inc_y) | |||
|     { | |||
|         if (1 == inc_x) | |||
|         { | |||
|             /* contiguous x, contiguous y */ | |||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR | |||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR | |||
| #define DLOAD_Y8 DLOAD_Y8_VECTOR | |||
| #define DLOAD_Y4 DLOAD_Y4_VECTOR | |||
| #define DSTORE_Y8 DSTORE_Y8_VECTOR | |||
| #define DSTORE_Y4 DSTORE_Y4_VECTOR | |||
|             DGEMV_N_MSA(); | |||
| #undef DLOAD_X8_SCALE | |||
| #undef DLOAD_X4_SCALE | |||
| #undef DLOAD_Y8 | |||
| #undef DLOAD_Y4 | |||
| #undef DSTORE_Y8 | |||
| #undef DSTORE_Y4 | |||
|         } | |||
|         else | |||
|         { | |||
|             /* strided x, contiguous y */ | |||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP | |||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP | |||
| #define DLOAD_Y8 DLOAD_Y8_VECTOR | |||
| #define DLOAD_Y4 DLOAD_Y4_VECTOR | |||
| #define DSTORE_Y8 DSTORE_Y8_VECTOR | |||
| #define DSTORE_Y4 DSTORE_Y4_VECTOR | |||
|             DGEMV_N_MSA(); | |||
| #undef DLOAD_X8_SCALE | |||
| #undef DLOAD_X4_SCALE | |||
| #undef DLOAD_Y8 | |||
| #undef DLOAD_Y4 | |||
| #undef DSTORE_Y8 | |||
| #undef DSTORE_Y4 | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         if (1 == inc_x) | |||
|         { | |||
|             /* contiguous x, strided y */ | |||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR | |||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR | |||
| #define DLOAD_Y8 DLOAD_Y8_GP | |||
| #define DLOAD_Y4 DLOAD_Y4_GP | |||
| #define DSTORE_Y8 DSTORE_Y8_GP | |||
| #define DSTORE_Y4 DSTORE_Y4_GP | |||
|             DGEMV_N_MSA(); | |||
| #undef DLOAD_X8_SCALE | |||
| #undef DLOAD_X4_SCALE | |||
| #undef DLOAD_Y8 | |||
| #undef DLOAD_Y4 | |||
| #undef DSTORE_Y8 | |||
| #undef DSTORE_Y4 | |||
|         } | |||
|         else | |||
|         { | |||
|             /* strided x, strided y */ | |||
| #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP | |||
| #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP | |||
| #define DLOAD_Y8 DLOAD_Y8_GP | |||
| #define DLOAD_Y4 DLOAD_Y4_GP | |||
| #define DSTORE_Y8 DSTORE_Y8_GP | |||
| #define DSTORE_Y4 DSTORE_Y4_GP | |||
|             DGEMV_N_MSA(); | |||
| #undef DLOAD_X8_SCALE | |||
| #undef DLOAD_X4_SCALE | |||
| #undef DLOAD_Y8 | |||
| #undef DLOAD_Y4 | |||
| #undef DSTORE_Y8 | |||
| #undef DSTORE_Y4 | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,589 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* For each of the eight lines pa0..pa7: loads eight values at pa_r + k and adds */ | |||
| /* their lane-wise products with x0..x3 into that line's accumulator tp_r. */ | |||
| /* Expands in a scope that provides pa0..pa7, k, x0..x3, tp0..tp7 and t0..t31. */ | |||
| /* Each line is loaded right before use; the order of the adds into every tp_r */ | |||
| /* is unchanged (x0, x1, x2, x3). */ | |||
| #define DGEMV_T_8x8() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     tp0 += x0 * t0; \ | |||
|     tp0 += x1 * t1; \ | |||
|     tp0 += x2 * t2; \ | |||
|     tp0 += x3 * t3; \ | |||
|     \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     tp1 += x0 * t4; \ | |||
|     tp1 += x1 * t5; \ | |||
|     tp1 += x2 * t6; \ | |||
|     tp1 += x3 * t7; \ | |||
|     \ | |||
|     LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ | |||
|     tp2 += x0 * t8; \ | |||
|     tp2 += x1 * t9; \ | |||
|     tp2 += x2 * t10; \ | |||
|     tp2 += x3 * t11; \ | |||
|     \ | |||
|     LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ | |||
|     tp3 += x0 * t12; \ | |||
|     tp3 += x1 * t13; \ | |||
|     tp3 += x2 * t14; \ | |||
|     tp3 += x3 * t15; \ | |||
|     \ | |||
|     LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ | |||
|     tp4 += x0 * t16; \ | |||
|     tp4 += x1 * t17; \ | |||
|     tp4 += x2 * t18; \ | |||
|     tp4 += x3 * t19; \ | |||
|     \ | |||
|     LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ | |||
|     tp5 += x0 * t20; \ | |||
|     tp5 += x1 * t21; \ | |||
|     tp5 += x2 * t22; \ | |||
|     tp5 += x3 * t23; \ | |||
|     \ | |||
|     LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ | |||
|     tp6 += x0 * t24; \ | |||
|     tp6 += x1 * t25; \ | |||
|     tp6 += x2 * t26; \ | |||
|     tp6 += x3 * t27; \ | |||
|     \ | |||
|     LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ | |||
|     tp7 += x0 * t28; \ | |||
|     tp7 += x1 * t29; \ | |||
|     tp7 += x2 * t30; \ | |||
|     tp7 += x3 * t31; \ | |||
| } | |||
| /* For each of the eight lines pa0..pa7: loads four values at pa_r + k and adds */ | |||
| /* their lane-wise products with x0..x1 into that line's accumulator tp_r. */ | |||
| /* Expands in a scope that provides pa0..pa7, k, x0, x1, tp0..tp7 and the t* temporaries. */ | |||
| /* Each line is loaded right before use; every tp_r still adds x0 before x1. */ | |||
| #define DGEMV_T_8x4() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     tp0 += x0 * t0; \ | |||
|     tp0 += x1 * t1; \ | |||
|     \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     tp1 += x0 * t4; \ | |||
|     tp1 += x1 * t5; \ | |||
|     \ | |||
|     LD_DP2(pa2 + k, 2, t8, t9); \ | |||
|     tp2 += x0 * t8; \ | |||
|     tp2 += x1 * t9; \ | |||
|     \ | |||
|     LD_DP2(pa3 + k, 2, t12, t13); \ | |||
|     tp3 += x0 * t12; \ | |||
|     tp3 += x1 * t13; \ | |||
|     \ | |||
|     LD_DP2(pa4 + k, 2, t16, t17); \ | |||
|     tp4 += x0 * t16; \ | |||
|     tp4 += x1 * t17; \ | |||
|     \ | |||
|     LD_DP2(pa5 + k, 2, t20, t21); \ | |||
|     tp5 += x0 * t20; \ | |||
|     tp5 += x1 * t21; \ | |||
|     \ | |||
|     LD_DP2(pa6 + k, 2, t24, t25); \ | |||
|     tp6 += x0 * t24; \ | |||
|     tp6 += x1 * t25; \ | |||
|     \ | |||
|     LD_DP2(pa7 + k, 2, t28, t29); \ | |||
|     tp7 += x0 * t28; \ | |||
|     tp7 += x1 * t29; \ | |||
| } | |||
| /* 8 columns x 2 rows: for each column pointer pa0..pa7, load one vector at row offset k and multiply-accumulate it with x0 into tp0..tp7. */ | |||
| #define DGEMV_T_8x2() \ | |||
| { \ | |||
|     t0 = LD_DP(pa0 + k); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
| \ | |||
|     t4 = LD_DP(pa1 + k); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
| \ | |||
|     t8 = LD_DP(pa2 + k); \ | |||
|     tp2 = tp2 + x0 * t8; \ | |||
| \ | |||
|     t12 = LD_DP(pa3 + k); \ | |||
|     tp3 = tp3 + x0 * t12; \ | |||
| \ | |||
|     t16 = LD_DP(pa4 + k); \ | |||
|     tp4 = tp4 + x0 * t16; \ | |||
| \ | |||
|     t20 = LD_DP(pa5 + k); \ | |||
|     tp5 = tp5 + x0 * t20; \ | |||
| \ | |||
|     t24 = LD_DP(pa6 + k); \ | |||
|     tp6 = tp6 + x0 * t24; \ | |||
| \ | |||
|     t28 = LD_DP(pa7 + k); \ | |||
|     tp7 = tp7 + x0 * t28; \ | |||
| } | |||
| /* 4 columns x 8 rows: for each column pointer pa0..pa3, load four vectors at row offset k and multiply-accumulate them with x0..x3 into tp0..tp3. */ | |||
| #define DGEMV_T_4x8() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
|     tp0 = tp0 + x1 * t1; \ | |||
|     tp0 = tp0 + x2 * t2; \ | |||
|     tp0 = tp0 + x3 * t3; \ | |||
| \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
|     tp1 = tp1 + x1 * t5; \ | |||
|     tp1 = tp1 + x2 * t6; \ | |||
|     tp1 = tp1 + x3 * t7; \ | |||
| \ | |||
|     LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ | |||
|     tp2 = tp2 + x0 * t8; \ | |||
|     tp2 = tp2 + x1 * t9; \ | |||
|     tp2 = tp2 + x2 * t10; \ | |||
|     tp2 = tp2 + x3 * t11; \ | |||
| \ | |||
|     LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ | |||
|     tp3 = tp3 + x0 * t12; \ | |||
|     tp3 = tp3 + x1 * t13; \ | |||
|     tp3 = tp3 + x2 * t14; \ | |||
|     tp3 = tp3 + x3 * t15; \ | |||
| } | |||
| /* 4 columns x 4 rows: for each column pointer pa0..pa3, load two vectors at row offset k and multiply-accumulate them with x0 and x1 into tp0..tp3. */ | |||
| #define DGEMV_T_4x4() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
|     tp0 = tp0 + x1 * t1; \ | |||
| \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
|     tp1 = tp1 + x1 * t5; \ | |||
| \ | |||
|     LD_DP2(pa2 + k, 2, t8, t9); \ | |||
|     tp2 = tp2 + x0 * t8; \ | |||
|     tp2 = tp2 + x1 * t9; \ | |||
| \ | |||
|     LD_DP2(pa3 + k, 2, t12, t13); \ | |||
|     tp3 = tp3 + x0 * t12; \ | |||
|     tp3 = tp3 + x1 * t13; \ | |||
| } | |||
| /* 4 columns x 2 rows: for each column pointer pa0..pa3, load one vector at row offset k and multiply-accumulate it with x0 into tp0..tp3. */ | |||
| #define DGEMV_T_4x2() \ | |||
| { \ | |||
|     t0 = LD_DP(pa0 + k); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
| \ | |||
|     t4 = LD_DP(pa1 + k); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
| \ | |||
|     t8 = LD_DP(pa2 + k); \ | |||
|     tp2 = tp2 + x0 * t8; \ | |||
| \ | |||
|     t12 = LD_DP(pa3 + k); \ | |||
|     tp3 = tp3 + x0 * t12; \ | |||
| } | |||
| /* 2 columns x 8 rows: for pa0 and pa1, load four vectors at row offset k and multiply-accumulate them with x0..x3 into tp0 and tp1. */ | |||
| #define DGEMV_T_2x8() \ | |||
| { \ | |||
|     LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
|     tp0 = tp0 + x1 * t1; \ | |||
|     tp0 = tp0 + x2 * t2; \ | |||
|     tp0 = tp0 + x3 * t3; \ | |||
| \ | |||
|     LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
|     tp1 = tp1 + x1 * t5; \ | |||
|     tp1 = tp1 + x2 * t6; \ | |||
|     tp1 = tp1 + x3 * t7; \ | |||
| } | |||
| /* 2 columns x 4 rows: for pa0 and pa1, load two vectors at row offset k and multiply-accumulate them with x0 and x1 into tp0 and tp1. */ | |||
| #define DGEMV_T_2x4() \ | |||
| { \ | |||
|     LD_DP2(pa0 + k, 2, t0, t1); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
|     tp0 = tp0 + x1 * t1; \ | |||
| \ | |||
|     LD_DP2(pa1 + k, 2, t4, t5); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
|     tp1 = tp1 + x1 * t5; \ | |||
| } | |||
| /* 2 columns x 2 rows: for pa0 and pa1, load one vector at row offset k and multiply-accumulate it with x0 into tp0 and tp1. */ | |||
| #define DGEMV_T_2x2() \ | |||
| { \ | |||
|     t0 = LD_DP(pa0 + k); \ | |||
|     tp0 = tp0 + x0 * t0; \ | |||
| \ | |||
|     t4 = LD_DP(pa1 + k); \ | |||
|     tp1 = tp1 + x0 * t4; \ | |||
| } | |||
| /* Strided ("general purpose register") loads of x: gather 8, 4 or 2 elements spaced inc_x apart into the vectors x0..x3, two elements per vector. */ | |||
| /* Each vector is built with two __msa_insert_d calls (lane 0, then lane 1). tp0 is only the seed operand of the first insert; both of its lanes are overwritten, so its value does not matter. */ | |||
| /* NOTE(review): each element is read through a (long long *) cast of a FLOAT pointer, i.e. the code relies on the compiler tolerating this type pun. */ | |||
| /* The last line of each macro deliberately has NO trailing backslash: a dangling continuation would splice the following #define into the macro body. */ | |||
| #define DLOAD_X8_GP() \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ | |||
|     x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ | |||
|     x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ | |||
|     x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \ | |||
|     x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \ | |||
|     x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \ | |||
|     x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); | |||
| #define DLOAD_X4_GP() \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ | |||
|     x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ | |||
|     x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); | |||
| #define DLOAD_X2_GP() \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ | |||
|     x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); | |||
| #define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3); /* contiguous-x variant: fill x0..x3 from x with one LD_DP4 (helper from macros_msa.h; presumably 4 vectors, 2 elements apart - confirm there) */ | |||
| #define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1); /* contiguous-x variant: fill x0 and x1 from x with one LD_DP2 */ | |||
| #define DLOAD_X2_VECTOR() x0 = LD_DP(x); /* contiguous-x variant: fill x0 from x with a single LD_DP */ | |||
| #define DGEMV_T_MSA() /* Kernel body: for every column of A (columns are lda elements apart) it accumulates the sum over the m rows of pa[k] * x[k * inc_x] and adds alpha times that sum to the matching y element. Columns are processed 8, then 4, 2 and 1 at a time; rows 8, 4, 2 and 1 at a time. Requires DLOAD_X8/DLOAD_X4/DLOAD_X2 to be #defined at the expansion site and uses the caller's locals (i, j, k, x, y, pa0..pa7, x0..x3, t*, tp*, temp*, res*, zero, srcx_org) by name. */ \ | |||
| for (j = (n >> 3); j--;) /* main pass: 8 columns per iteration */ \ | |||
| { \ | |||
| tp0 = zero; \ | |||
| tp1 = zero; \ | |||
| tp2 = zero; \ | |||
| tp3 = zero; \ | |||
| tp4 = zero; \ | |||
| tp5 = zero; \ | |||
| tp6 = zero; \ | |||
| tp7 = zero; \ | |||
| /* restart the row offset and rewind x to its first element for this column group */ \ | |||
| k = 0; \ | |||
| x = srcx_org; \ | |||
| /* rows: blocks of 8, then an optional 4-row and 2-row remainder, all vectorised */ \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_X8(); \ | |||
| DGEMV_T_8x8(); \ | |||
| \ | |||
| x += 8 * inc_x; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_X4(); \ | |||
| DGEMV_T_8x4(); \ | |||
| \ | |||
| x += 4 * inc_x; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 2) \ | |||
| { \ | |||
| DLOAD_X2(); \ | |||
| DGEMV_T_8x2(); \ | |||
| \ | |||
| x += 2 * inc_x; \ | |||
| k += 2; \ | |||
| } \ | |||
| /* fold the lanes of tp0..tp7 so that t0..t3 hold one total per column, two columns per vector (ILVRL_D2_DP and ADD2 are macros_msa.h helpers; presumably lane interleave and vector add - confirm there) */ \ | |||
| ILVRL_D2_DP(tp1, tp0, t0, t4); \ | |||
| ILVRL_D2_DP(tp3, tp2, t1, t5); \ | |||
| ILVRL_D2_DP(tp5, tp4, t2, t6); \ | |||
| ILVRL_D2_DP(tp7, tp6, t3, t7); \ | |||
| ADD2(t0, t4, t1, t5, t0, t1); \ | |||
| ADD2(t2, t6, t3, t7, t2, t3); \ | |||
| /* pull the per-column totals out into scalars */ \ | |||
| temp0 = t0[0]; \ | |||
| temp1 = t0[1]; \ | |||
| temp2 = t1[0]; \ | |||
| temp3 = t1[1]; \ | |||
| temp4 = t2[0]; \ | |||
| temp5 = t2[1]; \ | |||
| temp6 = t3[0]; \ | |||
| temp7 = t3[1]; \ | |||
| /* odd final row is handled in scalar code */ \ | |||
| if (m & 1) \ | |||
| { \ | |||
| temp0 += pa0[k] * x[0]; \ | |||
| temp1 += pa1[k] * x[0]; \ | |||
| temp2 += pa2[k] * x[0]; \ | |||
| temp3 += pa3[k] * x[0]; \ | |||
| temp4 += pa4[k] * x[0]; \ | |||
| temp5 += pa5[k] * x[0]; \ | |||
| temp6 += pa6[k] * x[0]; \ | |||
| temp7 += pa7[k] * x[0]; \ | |||
| \ | |||
| x += inc_x; \ | |||
| k++; \ | |||
| } \ | |||
| /* read-modify-write of the 8 y elements (inc_y apart): y += alpha * total */ \ | |||
| res0 = y[0 * inc_y]; \ | |||
| res1 = y[1 * inc_y]; \ | |||
| res2 = y[2 * inc_y]; \ | |||
| res3 = y[3 * inc_y]; \ | |||
| res4 = y[4 * inc_y]; \ | |||
| res5 = y[5 * inc_y]; \ | |||
| res6 = y[6 * inc_y]; \ | |||
| res7 = y[7 * inc_y]; \ | |||
| \ | |||
| res0 += alpha * temp0; \ | |||
| res1 += alpha * temp1; \ | |||
| res2 += alpha * temp2; \ | |||
| res3 += alpha * temp3; \ | |||
| res4 += alpha * temp4; \ | |||
| res5 += alpha * temp5; \ | |||
| res6 += alpha * temp6; \ | |||
| res7 += alpha * temp7; \ | |||
| \ | |||
| y[0 * inc_y] = res0; \ | |||
| y[1 * inc_y] = res1; \ | |||
| y[2 * inc_y] = res2; \ | |||
| y[3 * inc_y] = res3; \ | |||
| y[4 * inc_y] = res4; \ | |||
| y[5 * inc_y] = res5; \ | |||
| y[6 * inc_y] = res6; \ | |||
| y[7 * inc_y] = res7; \ | |||
| /* step y and all eight column pointers to the next group of 8 columns */ \ | |||
| y += 8 * inc_y; \ | |||
| \ | |||
| pa0 += 8 * lda; \ | |||
| pa1 += 8 * lda; \ | |||
| pa2 += 8 * lda; \ | |||
| pa3 += 8 * lda; \ | |||
| pa4 += 8 * lda; \ | |||
| pa5 += 8 * lda; \ | |||
| pa6 += 8 * lda; \ | |||
| pa7 += 8 * lda; \ | |||
| } \ | |||
| /* remainder: 4 columns, same scheme using pa0..pa3 and tp0..tp3 */ \ | |||
| if (n & 4) \ | |||
| { \ | |||
| tp0 = zero; \ | |||
| tp1 = zero; \ | |||
| tp2 = zero; \ | |||
| tp3 = zero; \ | |||
| \ | |||
| k = 0; \ | |||
| x = srcx_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_X8(); \ | |||
| DGEMV_T_4x8(); \ | |||
| \ | |||
| x += 8 * inc_x; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_X4(); \ | |||
| DGEMV_T_4x4(); \ | |||
| \ | |||
| x += 4 * inc_x; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 2) \ | |||
| { \ | |||
| DLOAD_X2(); \ | |||
| DGEMV_T_4x2(); \ | |||
| \ | |||
| x += 2 * inc_x; \ | |||
| k += 2; \ | |||
| } \ | |||
| /* fold lanes: t0 and t1 end up with the four column totals */ \ | |||
| ILVRL_D2_DP(tp1, tp0, t0, t4); \ | |||
| ILVRL_D2_DP(tp3, tp2, t1, t5); \ | |||
| ADD2(t0, t4, t1, t5, t0, t1); \ | |||
| \ | |||
| temp0 = t0[0]; \ | |||
| temp1 = t0[1]; \ | |||
| temp2 = t1[0]; \ | |||
| temp3 = t1[1]; \ | |||
| \ | |||
| if (m & 1) \ | |||
| { \ | |||
| temp0 += pa0[k] * x[0]; \ | |||
| temp1 += pa1[k] * x[0]; \ | |||
| temp2 += pa2[k] * x[0]; \ | |||
| temp3 += pa3[k] * x[0]; \ | |||
| \ | |||
| x += inc_x; \ | |||
| k++; \ | |||
| } \ | |||
| \ | |||
| res0 = y[0 * inc_y]; \ | |||
| res1 = y[1 * inc_y]; \ | |||
| res2 = y[2 * inc_y]; \ | |||
| res3 = y[3 * inc_y]; \ | |||
| \ | |||
| res0 += alpha * temp0; \ | |||
| res1 += alpha * temp1; \ | |||
| res2 += alpha * temp2; \ | |||
| res3 += alpha * temp3; \ | |||
| \ | |||
| y[0 * inc_y] = res0; \ | |||
| y[1 * inc_y] = res1; \ | |||
| y[2 * inc_y] = res2; \ | |||
| y[3 * inc_y] = res3; \ | |||
| \ | |||
| y += 4 * inc_y; \ | |||
| \ | |||
| pa0 += 4 * lda; \ | |||
| pa1 += 4 * lda; \ | |||
| pa2 += 4 * lda; \ | |||
| pa3 += 4 * lda; \ | |||
| } \ | |||
| /* remainder: 2 columns, using pa0/pa1 and tp0/tp1 */ \ | |||
| if (n & 2) \ | |||
| { \ | |||
| tp0 = zero; \ | |||
| tp1 = zero; \ | |||
| \ | |||
| k = 0; \ | |||
| x = srcx_org; \ | |||
| \ | |||
| for (i = (m >> 3); i--;) \ | |||
| { \ | |||
| DLOAD_X8(); \ | |||
| DGEMV_T_2x8(); \ | |||
| \ | |||
| x += 8 * inc_x; \ | |||
| k += 8; \ | |||
| } \ | |||
| \ | |||
| if (m & 4) \ | |||
| { \ | |||
| DLOAD_X4(); \ | |||
| DGEMV_T_2x4(); \ | |||
| \ | |||
| x += 4 * inc_x; \ | |||
| k += 4; \ | |||
| } \ | |||
| \ | |||
| if (m & 2) \ | |||
| { \ | |||
| DLOAD_X2(); \ | |||
| DGEMV_T_2x2(); \ | |||
| \ | |||
| x += 2 * inc_x; \ | |||
| k += 2; \ | |||
| } \ | |||
| /* fold lanes: after the add, t0 carries both column totals */ \ | |||
| ILVRL_D2_DP(tp1, tp0, t0, t4); \ | |||
| \ | |||
| t0 += t4; \ | |||
| \ | |||
| temp0 = t0[0]; \ | |||
| temp1 = t0[1]; \ | |||
| \ | |||
| if (m & 1) \ | |||
| { \ | |||
| temp0 += pa0[k] * x[0]; \ | |||
| temp1 += pa1[k] * x[0]; \ | |||
| x += inc_x; \ | |||
| k++; \ | |||
| } \ | |||
| \ | |||
| res0 = y[0 * inc_y]; \ | |||
| res1 = y[1 * inc_y]; \ | |||
| \ | |||
| res0 += alpha * temp0; \ | |||
| res1 += alpha * temp1; \ | |||
| \ | |||
| y[0 * inc_y] = res0; \ | |||
| y[1 * inc_y] = res1; \ | |||
| \ | |||
| y += 2 * inc_y; \ | |||
| \ | |||
| pa0 += 2 * lda; \ | |||
| pa1 += 2 * lda; \ | |||
| } \ | |||
| /* last single column: plain scalar dot product over all m rows */ \ | |||
| if (n & 1) \ | |||
| { \ | |||
| temp0 = 0.0; \ | |||
| \ | |||
| k = 0; \ | |||
| x = srcx_org; \ | |||
| \ | |||
| for (i = m; i--;) \ | |||
| { \ | |||
| temp0 += pa0[k] * x[0]; \ | |||
| x += inc_x; \ | |||
| k++; \ | |||
| } \ | |||
| \ | |||
| y[0] += alpha * temp0; \ | |||
| y += inc_y; \ | |||
| pa0 += lda; \ | |||
| } | |||
| /* Transposed matrix-vector kernel: for each of the n columns of A (m rows, columns lda elements apart) adds alpha * (column . x) to the corresponding element of y. */ | |||
| /* x is read with stride inc_x and y is updated with stride inc_y. dummy1 and buffer are not used by this implementation. Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
|     /* Every local below is referenced by name from DGEMV_T_MSA and its helper macros, so the names must not change. */ | |||
|     BLASLONG i, j, k; | |||
|     FLOAT *srcx_org = x; /* start of x, restored before each column group */ | |||
|     /* Pointers to eight consecutive columns of A. */ | |||
|     FLOAT *pa0 = A; | |||
|     FLOAT *pa1 = A + lda; | |||
|     FLOAT *pa2 = A + 2 * lda; | |||
|     FLOAT *pa3 = A + 3 * lda; | |||
|     FLOAT *pa4 = A + 4 * lda; | |||
|     FLOAT *pa5 = A + 5 * lda; | |||
|     FLOAT *pa6 = A + 6 * lda; | |||
|     FLOAT *pa7 = A + 7 * lda; | |||
|     FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||
|     FLOAT res0, res1, res2, res3, res4, res5, res6, res7; | |||
|     v2f64 x0, x1, x2, x3; | |||
|     v2f64 t0, t1, t2, t3, t4, t5, t6, t7; | |||
|     v2f64 t8, t9, t10, t11, t12, t13, t14, t15; | |||
|     v2f64 t16, t17, t18, t19, t20, t21, t22, t23; | |||
|     v2f64 t24, t25, t26, t27, t28, t29, t30, t31; | |||
|     v2f64 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; | |||
|     v2f64 zero = {0}; | |||
|     /* The kernel body is expanded twice; only the way x is loaded differs between the two expansions. */ | |||
|     if (inc_x != 1) | |||
|     { | |||
|         /* Strided x: gather the elements one by one. */ | |||
| #define DLOAD_X8 DLOAD_X8_GP | |||
| #define DLOAD_X4 DLOAD_X4_GP | |||
| #define DLOAD_X2 DLOAD_X2_GP | |||
|         DGEMV_T_MSA(); | |||
| #undef DLOAD_X8 | |||
| #undef DLOAD_X4 | |||
| #undef DLOAD_X2 | |||
|     } | |||
|     else | |||
|     { | |||
|         /* Unit-stride x: use vector loads. */ | |||
| #define DLOAD_X8 DLOAD_X8_VECTOR | |||
| #define DLOAD_X4 DLOAD_X4_VECTOR | |||
| #define DLOAD_X2 DLOAD_X2_VECTOR | |||
|         DGEMV_T_MSA(); | |||
| #undef DLOAD_X8 | |||
| #undef DLOAD_X4 | |||
| #undef DLOAD_X2 | |||
|     } | |||
|     return 0; | |||
| } | |||