diff --git a/CMakeLists.txt b/CMakeLists.txt
index ead63bff8..ff42643fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
-set(OpenBLAS_PATCH_VERSION 18)
+set(OpenBLAS_PATCH_VERSION 19)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
@@ -45,8 +45,8 @@ endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
-include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
-include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@@ -123,9 +123,9 @@ endforeach ()
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
- include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
- include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
endif ()
endif ()
@@ -137,7 +137,7 @@ endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
-include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index ebe52ea8a..5ecf32b91 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -150,3 +150,14 @@ In chronological order:
* theoractice
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking
+
+* Paul Mustière
+ * [2016-02-04] Fix Android build on ARMV7
+ * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
+
+* Shivraj Patil
+ * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
+
+* Kaustubh Raste
+ * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
+ * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
diff --git a/Changelog.txt b/Changelog.txt
index 7f82e8e88..2eb27ab04 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,22 @@
OpenBLAS ChangeLog
+====================================================================
+Version 0.2.19
+1-Sep-2016
+common:
+ * Improved cross-compiling support.
+ * Fixed a bug on musl libc.
+
+POWER:
+ * Optimized BLAS on Power8
+ * Fixed Julia+OpenBLAS bugs on Power8
+
+MIPS:
+ * Optimized BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste)
+
+ARM:
+ * Improved performance on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
+
+
====================================================================
Version 0.2.18
12-Apr-2016
diff --git a/Makefile b/Makefile
index 9ba2bffb3..2ae004798 100644
--- a/Makefile
+++ b/Makefile
@@ -108,8 +108,6 @@ endif
tests :
ifndef NOFORTRAN
-ifndef TARGET
-ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@@ -119,8 +117,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
endif
-endif
-endif
libs :
ifeq ($(CORE), UNKOWN)
diff --git a/Makefile.install b/Makefile.install
index 5da4e68c9..1b9388a8b 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -20,75 +20,75 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly)
install : lib.grd
- @-mkdir -p $(DESTDIR)$(PREFIX)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
- @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
+ @-mkdir -p "$(DESTDIR)$(PREFIX)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
- @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
- @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+ @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
+ @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
- @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
+ @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
+ @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
+ @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
- @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
+ @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
- @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
- @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
- @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
- @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
- @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
- @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
+ @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
+ @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
+ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
- @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
- @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
+ @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@@ -96,34 +96,34 @@ endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
- @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
- @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+ @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), Darwin)
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else
#only static
- @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
+ @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
- @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
- @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
+ @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
+ @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!
diff --git a/Makefile.mips b/Makefile.mips
new file mode 100644
index 000000000..05ea9c679
--- /dev/null
+++ b/Makefile.mips
@@ -0,0 +1,3 @@
+ifdef BINARY64
+else
+endif
diff --git a/Makefile.power b/Makefile.power
index 7e2b47386..79db83751 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -1,4 +1,26 @@
-# CCOMMON_OPT += -DALLOC_SHM
+
+ifdef USE_THREAD
+ifeq ($(USE_THREAD), 0)
+USE_OPENMP = 0
+else
+USE_OPENMP = 1
+endif
+else
+USE_OPENMP = 1
+endif
+
+
+
+ifeq ($(CORE), POWER8)
+ifeq ($(USE_OPENMP), 1)
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
+FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
+else
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
+endif
+endif
+
FLAMEPATH = $(HOME)/flame/lib
@@ -16,6 +38,16 @@ else
endif
endif
+#Either uncomment the line below or run make with `USE_MASS=1` to enable support for the MASS library
+#USE_MASS = 1
+
+ifeq ($(USE_MASS), 1)
+# Path to MASS libs, change it if the libs are installed at any other location
+MASSPATH = /opt/ibm/xlmass/8.1.3/lib
+COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
+EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
+endif
+
ifdef BINARY64
diff --git a/Makefile.prebuild b/Makefile.prebuild
index ee0b67787..524f0a741 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -17,14 +17,26 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
+ifeq ($(TARGET), P5600)
+TARGET_FLAGS = -mips32r5
+endif
+
+ifeq ($(TARGET), I6400)
+TARGET_FLAGS = -mips64r6
+endif
+
+ifeq ($(TARGET), P6600)
+TARGET_FLAGS = -mips64r6
+endif
+
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch
- perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC)
+ perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
ifneq ($(ONLY_CBLAS), 1)
- perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC)
+ perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else
#When we only build CBLAS, we set NOFORTRAN=2
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
diff --git a/Makefile.rule b/Makefile.rule
index d8db6102c..5bb9cf0b7 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.2.18
+VERSION = 0.2.19
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -52,6 +52,7 @@ VERSION = 0.2.18
# USE_THREAD = 0
# If you're going to use this library with OpenMP, please comment it in.
+# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1
# You can define maximum number of threads. Basically it should be
@@ -153,10 +154,12 @@ NO_AFFINITY = 1
# Common Optimization Flag;
# The default -O2 is enough.
+# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
+# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive
# Profiling flags
diff --git a/Makefile.system b/Makefile.system
index b89f60e96..b05177b6c 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -159,7 +159,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all)
+DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
@@ -462,7 +462,7 @@ endif
endif
endif
-ifeq ($(ARCH), mips64)
+ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
NO_BINARY_MODE = 1
endif
@@ -502,13 +502,16 @@ endif
ifdef NO_BINARY_MODE
-ifeq ($(ARCH), mips64)
+ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
CCOMMON_OPT += -mabi=64
else
CCOMMON_OPT += -mabi=n32
endif
BINARY_DEFINED = 1
+else ifeq ($(ARCH), $(filter $(ARCH),mips))
+CCOMMON_OPT += -mabi=32
+BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
@@ -521,6 +524,21 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
+ifeq ($(CORE), P5600)
+CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
+FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
+endif
+
+ifeq ($(CORE), I6400)
+CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
+FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
+endif
+
+ifeq ($(CORE), P6600)
+CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
+FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
+endif
+
ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
@@ -589,12 +607,14 @@ ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
endif
ifdef NO_BINARY_MODE
-ifeq ($(ARCH), mips64)
+ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
FCOMMON_OPT += -mabi=64
else
FCOMMON_OPT += -mabi=n32
endif
+else ifeq ($(ARCH), $(filter $(ARCH),mips))
+FCOMMON_OPT += -mabi=32
endif
else
ifdef BINARY64
@@ -677,21 +697,7 @@ FCOMMON_OPT += -i8
endif
endif
endif
-
-ifneq ($(ARCH), mips64)
-ifndef BINARY64
-FCOMMON_OPT += -m32
-else
-FCOMMON_OPT += -m64
-endif
-else
-ifdef BINARY64
-FCOMMON_OPT += -mabi=64
-else
-FCOMMON_OPT += -mabi=n32
-endif
-endif
-
+
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
@@ -707,7 +713,7 @@ endif
endif
endif
-ifeq ($(ARCH), mips64)
+ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
FCOMMON_OPT += -n32
else
@@ -737,7 +743,7 @@ endif
ifeq ($(C_COMPILER), OPEN64)
-ifeq ($(ARCH), mips64)
+ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
CCOMMON_OPT += -n32
else
@@ -1126,6 +1132,8 @@ export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
export HAVE_NEON
+export HAVE_MSA
+export MSA_FLAGS
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE
diff --git a/README.md b/README.md
index 32a861081..ff55edaa1 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6
make DEBUG=1
+### Compile with MASS Support on Power CPU (Optional dependency)
+
+[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
+Fortran-language applications that are tuned for optimum performance on POWER architectures. Building OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
+The library can be installed as shown below:
+
+ * On Ubuntu:
+
+ wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
+ echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
+ sudo apt-get update
+ sudo apt-get install libxlmass-devel.8.1.3
+
+ * On RHEL/CentOS:
+
+ wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
+ sudo rpm --import repomd.xml.key
+ wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
+ sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
+ sudo yum install libxlmass-devel.8.1.3
+
+After installing the MASS library, compile OpenBLAS with USE_MASS=1.
+
+Example:
+
+Compiling on Power8 with MASS support:
+
+ make USE_MASS=1 TARGET=POWER8
+
### Install to the directory (optional)
Example:
@@ -82,6 +111,7 @@ Please read GotoBLAS_01Readme.txt
- **MingWin or Visual Studio(CMake)/Windows**: Please read .
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
+- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
diff --git a/TargetList.txt b/TargetList.txt
index dc1e08722..52a60b49c 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -53,26 +53,31 @@ PPC440
PPC440FP2
CELL
-3.MIPS64 CPU:
+3.MIPS CPU:
+P5600
+
+4.MIPS64 CPU:
SICORTEX
LOONGSON3A
LOONGSON3B
+I6400
+P6600
-4.IA64 CPU:
+5.IA64 CPU:
ITANIUM2
-5.SPARC CPU:
+6.SPARC CPU:
SPARC
SPARCV7
-6.ARM CPU:
+7.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
-7.ARM 64-bit CPU:
+8.ARM 64-bit CPU:
ARMV8
CORTEXA57
diff --git a/appveyor.yml b/appveyor.yml
index 5360a9ef9..c9d8e47ac 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 0.2.18.{build}
+version: 0.2.19.{build}
#environment:
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 8166f3863..e801ce4eb 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
- smallscaling
+ smallscaling \
+ isamax.goto idamax.goto icamax.goto izamax.goto \
+ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
- ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
+ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
+ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
+ snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
@@ -261,7 +265,9 @@ endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
- slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
+ slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \
+ scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \
+ strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@@ -393,6 +399,9 @@ scholesky.mkl : scholesky.$(SUFFIX)
scholesky.veclib : scholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+scholesky.essl : scholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Dcholesky ###################################################
dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
@@ -410,6 +419,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX)
dcholesky.veclib : dcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dcholesky.essl : dcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ccholesky ###################################################
ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
@@ -427,6 +439,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX)
ccholesky.veclib : ccholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ccholesky.essl : ccholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Zcholesky ###################################################
@@ -445,6 +460,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX)
zcholesky.veclib : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+zcholesky.essl : zcholesky.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Sgemm ####################################################
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -683,6 +701,9 @@ strsm.mkl : strsm.$(SUFFIX)
strsm.veclib : strsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+strsm.essl : strsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Dtrsm ####################################################
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -699,6 +720,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX)
dtrsm.veclib : dtrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+dtrsm.essl : dtrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ctrsm ####################################################
ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
@@ -716,6 +740,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX)
ctrsm.veclib : ctrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ctrsm.essl : ctrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ztrsm ####################################################
ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
@@ -733,6 +760,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX)
ztrsm.veclib : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+ztrsm.essl : ztrsm.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Ssyrk ####################################################
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -1911,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
zgemm3m.veclib : zgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+############################################## ISAMAX ##############################################
+isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+isamax.atlas : isamax.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## IDAMAX ##############################################
+idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+idamax.atlas : idamax.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## ICAMAX ##############################################
+icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+icamax.atlas : icamax.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## IZAMAX ##############################################
+izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+izamax.atlas : izamax.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## SNRM2 ##############################################
+snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+snrm2.atlas : snrm2.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## DNRM2 ##############################################
+dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dnrm2.atlas : dnrm2.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## Sscnrm2 ##############################################
+scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+scnrm2.atlas : scnrm2.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## Ddznrm2 ##############################################
+dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dznrm2.atlas : dznrm2.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+
###################################################################################################
slinpack.$(SUFFIX) : linpack.c
@@ -2217,11 +2304,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+isamax.$(SUFFIX) : iamax.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idamax.$(SUFFIX) : iamax.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+icamax.$(SUFFIX) : iamax.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+izamax.$(SUFFIX) : iamax.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+
+snrm2.$(SUFFIX) : nrm2.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dnrm2.$(SUFFIX) : nrm2.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+scnrm2.$(SUFFIX) : nrm2.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+dznrm2.$(SUFFIX) : nrm2.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+
smallscaling: smallscaling.c ../$(LIBNAME)
- $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
+ $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
clean ::
- @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
+ @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
include $(TOPDIR)/Makefile.tail
diff --git a/benchmark/asum.c b/benchmark/asum.c
index beb6402f4..78ccdf47b 100644
--- a/benchmark/asum.c
+++ b/benchmark/asum.c
@@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
- fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
+ fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else
- fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
+ fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif
}
diff --git a/benchmark/axpy.c b/benchmark/axpy.c
index a7206b690..37c7aeb63 100644
--- a/benchmark/axpy.c
+++ b/benchmark/axpy.c
@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
- " %10.2f MFlops\n",
- COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
+ " %10.2f MFlops %10.6f sec\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}
diff --git a/benchmark/copy.c b/benchmark/copy.c
index 15c45201c..ea5b38d68 100644
--- a/benchmark/copy.c
+++ b/benchmark/copy.c
@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
- " %10.2f MBytes\n",
- COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
+ " %10.2f MBytes %10.6f sec\n",
+ COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
diff --git a/benchmark/dot.c b/benchmark/dot.c
index 4c8d6cc38..50d05e532 100644
--- a/benchmark/dot.c
+++ b/benchmark/dot.c
@@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
- " %10.2f MFlops\n",
- COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
+ " %10.2f MFlops %10.6f sec\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}
diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index 42af2825a..c06e829d9 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
- fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+ fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}
@@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
- fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+ fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}
diff --git a/benchmark/iamax.c b/benchmark/iamax.c
new file mode 100644
index 000000000..c55f41579
--- /dev/null
+++ b/benchmark/iamax.c
@@ -0,0 +1,190 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include
+#include
+#ifdef __CYGWIN32__
+#include
+#endif
+#include "common.h"
+
+
+#undef IAMAX
+
+#ifdef COMPLEX
+#ifdef DOUBLE
+#define IAMAX BLASFUNC(izamax)
+#else
+#define IAMAX BLASFUNC(icamax)
+#endif
+#else
+#ifdef DOUBLE
+#define IAMAX BLASFUNC(idamax)
+#else
+#define IAMAX BLASFUNC(isamax)
+#endif
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int main(int argc, char *argv[]){
+
+ FLOAT *x;
+ blasint m, i;
+ blasint inc_x=1;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Time\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+
+ for (l=0; l
+#include
+#ifdef __CYGWIN32__
+#include
+#endif
+#include "common.h"
+
+
+#undef NRM2
+
+#ifdef COMPLEX
+#ifdef DOUBLE
+#define NRM2 BLASFUNC(dznrm2)
+#else
+#define NRM2 BLASFUNC(scnrm2)
+#endif
+#else
+#ifdef DOUBLE
+#define NRM2 BLASFUNC(dnrm2)
+#else
+#define NRM2 BLASFUNC(snrm2)
+#endif
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int main(int argc, char *argv[]){
+
+ FLOAT *x;
+ blasint m, i;
+ blasint inc_x=1;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Time\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+
+ for (l=0; l
#include
#include
+#include
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10
diff --git a/benchmark/swap.c b/benchmark/swap.c
index 9f108ef50..368c59cd4 100644
--- a/benchmark/swap.c
+++ b/benchmark/swap.c
@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
- " %10.2f MBytes\n",
- COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
+ " %10.2f MBytes %10.6f sec\n",
+ COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
diff --git a/benchmark/trmm.c b/benchmark/trmm.c
index f81e9d912..54c7972db 100644
--- a/benchmark/trmm.c
+++ b/benchmark/trmm.c
@@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
- " %10.2f MFlops\n",
- COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
+ " %10.2f MFlops %10.6f sec\n",
+ COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
}
diff --git a/benchmark/zdot.c b/benchmark/zdot.c
index d5ec99726..ed9d4d2e8 100644
--- a/benchmark/zdot.c
+++ b/benchmark/zdot.c
@@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
- " %10.2f MFlops\n",
- COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
+ " %10.2f MFlops %10.6f sec\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}
diff --git a/c_check b/c_check
index bcf4c2cb3..2ec9fc484 100644
--- a/c_check
+++ b/c_check
@@ -1,5 +1,8 @@
#!/usr/bin/perl
+use File::Basename;
+use File::Temp qw(tempfile);
+
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@@ -8,6 +11,7 @@ $hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
+$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@@ -26,14 +30,12 @@ if ($?) {
$cross_suffix = "";
-if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
- if ($1 =~ /(.*-)(.*)/) {
- $cross_suffix = $1;
- }
-} else {
- if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
- $cross_suffix = $1;
- }
+if (dirname($compiler_name) ne ".") {
+ $cross_suffix .= dirname($compiler_name) . "/";
+}
+
+if (basename($compiler_name) =~ /(.*-)(.*)/) {
+ $cross_suffix .= $1;
}
$compiler = "";
@@ -63,7 +65,7 @@ $os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
-$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
+$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
@@ -79,7 +81,12 @@ if ($os eq "AIX") {
$defined = 1;
}
-if (($architecture eq "mips32") || ($architecture eq "mips64")) {
+if ($architecture eq "mips") {
+ $compiler_name .= " -mabi=32";
+ $defined = 1;
+}
+
+if ($architecture eq "mips64") {
$compiler_name .= " -mabi=n32" if ($binary eq "32");
$compiler_name .= " -mabi=64" if ($binary eq "64");
$defined = 1;
@@ -152,10 +159,28 @@ if ($?) {
die 1;
}
+$have_msa = 0;
+if (($architecture eq "mips") || ($architecture eq "mips64")) {
+ $code = '"addvi.b $w0, $w1, 1"';
+ $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
+ print $tmpf "#include \n\n";
+ print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+
+ $args = "$msa_flags -o $tmpf.o -x c $tmpf";
+ my @cmd = ("$compiler_name $args");
+ system(@cmd) == 0;
+ if ($? != 0) {
+ $have_msa = 0;
+ } else {
+ $have_msa = 1;
+ }
+ unlink("$tmpf.o");
+}
+
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
-$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
+$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
@@ -243,9 +268,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
-print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
+print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
+print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
+print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
@@ -257,6 +284,7 @@ print CONFFILE "#define C_$compiler\t1\n";
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
+print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
if ($os eq "LINUX") {
diff --git a/cmake/export.cmake b/cmake/export.cmake
index adf59101f..629f8fbc2 100644
--- a/cmake/export.cmake
+++ b/cmake/export.cmake
@@ -53,7 +53,7 @@ endif()
add_custom_command(
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND perl
- ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
+ ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index c3fa48655..471ce90e4 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -50,20 +50,20 @@ else()
set(TARGET_CONF "config.h")
endif ()
-include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
if (NOT NOFORTRAN)
- include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
endif ()
# compile getarch
set(GETARCH_SRC
- ${CMAKE_SOURCE_DIR}/getarch.c
+ ${PROJECT_SOURCE_DIR}/getarch.c
${CPUIDEMO}
)
if (NOT MSVC)
- list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S)
+ list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
endif ()
if (MSVC)
@@ -76,7 +76,7 @@ set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
- COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR}
+ COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
@@ -97,8 +97,8 @@ set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
- SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c
- COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR}
+ SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
+ COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 134e9c12d..aa046a56a 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -3,7 +3,7 @@
## Description: Ported from OpenBLAS/Makefile.system
##
-set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib")
+set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib")
# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa
# http://stackoverflow.com/questions/714100/os-detecting-makefile
@@ -78,7 +78,7 @@ else ()
set(ONLY_CBLAS 0)
endif ()
-include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (NOT DEFINED NUM_THREADS)
set(NUM_THREADS ${NUM_CORES})
@@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy")
set(OBJCONV "${CROSS_SUFFIX}objconv")
# OS dependent settings
-include("${CMAKE_SOURCE_DIR}/cmake/os.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/os.cmake")
# Architecture dependent settings
-include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings
-include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake")
+include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
- include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake")
+ include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
endif ()
if (BINARY64)
@@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX)
set(SYMBOLSUFFIX "")
endif ()
-set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}")
+set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
-# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake
+# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC")
@@ -410,8 +410,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def")
set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp")
set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip")
-set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}")
-set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}")
+set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}")
+set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}")
set(LIB_COMPONENTS BLAS)
diff --git a/common.h b/common.h
index e045e42b2..480174c11 100644
--- a/common.h
+++ b/common.h
@@ -332,6 +332,13 @@ typedef int blasint;
#endif
#endif
+#ifdef POWER8
+#ifndef YIELDING
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+#endif
+
+
/*
#ifdef PILEDRIVER
#ifndef YIELDING
@@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_sparc.h"
#endif
+#ifdef ARCH_MIPS
+#include "common_mips.h"
+#endif
+
#ifdef ARCH_MIPS64
#include "common_mips64.h"
#endif
@@ -615,9 +626,14 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
#ifdef USE_OPENMP
+#ifndef C_MSVC
int omp_in_parallel(void);
int omp_get_num_procs(void);
#else
+__declspec(dllimport) int __cdecl omp_in_parallel(void);
+__declspec(dllimport) int __cdecl omp_get_num_procs(void);
+#endif
+#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));
int omp_get_num_procs(void) __attribute__ ((weak));
diff --git a/common_mips.h b/common_mips.h
new file mode 100644
index 000000000..ae126949a
--- /dev/null
+++ b/common_mips.h
@@ -0,0 +1,109 @@
+/*****************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#ifndef COMMON_MIPS
+#define COMMON_MIPS
+
+#define MB
+#define WMB
+
+#define INLINE inline
+
+#define RETURN_BY_COMPLEX
+
+#ifndef ASSEMBLER
+
+static void INLINE blas_lock(volatile unsigned long *address){
+
+}
+#define BLAS_LOCK_DEFINED
+
+static inline unsigned int rpcc(void){
+ unsigned long ret;
+
+ __asm__ __volatile__(".set push \n"
+ "rdhwr %0, $30 \n"
+ ".set pop" : "=r"(ret) : : "memory");
+
+ return ret;
+}
+#define RPCC_DEFINED
+
+static inline int blas_quickdivide(blasint x, blasint y){
+ return x / y;
+}
+
+#define GET_IMAGE(res)
+
+#define GET_IMAGE_CANCEL
+
+#endif
+
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif
+
+#if defined(ASSEMBLER) && !defined(NEEDPARAM)
+
+#define PROLOGUE \
+ .arm ;\
+ .global REALNAME ;\
+ .func REALNAME ;\
+REALNAME:
+
+#define EPILOGUE
+
+#define PROFCODE
+
+#endif
+
+
+#define SEEK_ADDRESS
+
+#ifndef PAGESIZE
+#define PAGESIZE ( 4 << 10)
+#endif
+#define HUGE_PAGESIZE ( 4 << 20)
+
+#define BUFFER_SIZE (16 << 20)
+
+
+#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#endif
diff --git a/common_mips64.h b/common_mips64.h
index f5c0ec7cf..6078bf35b 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -102,7 +102,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){
unsigned long ret;
-#if defined(LOONGSON3A) || defined(LOONGSON3B)
+
// unsigned long long tmp;
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
@@ -111,17 +111,10 @@ static inline unsigned int rpcc(void){
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
-#else
- __asm__ __volatile__(".set push \n"
- ".set mips32r2\n"
- "rdhwr %0, $30 \n"
- ".set pop" : "=r"(ret) : : "memory");
-#endif
return ret;
}
#define RPCC_DEFINED
-#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
static inline int WhereAmI(void){
@@ -134,7 +127,6 @@ static inline int WhereAmI(void){
}
#endif
-#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
diff --git a/common_power.h b/common_power.h
index 723d949f2..e3a1a7aef 100644
--- a/common_power.h
+++ b/common_power.h
@@ -39,8 +39,13 @@
#ifndef COMMON_POWER
#define COMMON_POWER
+#if defined(POWER8)
+#define MB __asm__ __volatile__ ("eieio":::"memory")
+#define WMB __asm__ __volatile__ ("eieio":::"memory")
+#else
#define MB __asm__ __volatile__ ("sync")
#define WMB __asm__ __volatile__ ("sync")
+#endif
#define INLINE inline
@@ -798,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
-#define BUFFER_SIZE ( 32 << 20)
+#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
diff --git a/cpuid_mips.c b/cpuid_mips.c
index 22beff7fc..15c58959e 100644
--- a/cpuid_mips.c
+++ b/cpuid_mips.c
@@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#define CPU_UNKNOWN 0
-#define CPU_SICORTEX 1
-#define CPU_LOONGSON3A 2
-#define CPU_LOONGSON3B 3
+#define CPU_P5600 1
static char *cpuname[] = {
"UNKOWN",
- "SICORTEX",
- "LOONGSON3A",
- "LOONGSON3B"
+ "P5600"
};
int detect(void){
@@ -120,7 +116,7 @@ int detect(void){
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
- return CPU_SICORTEX;
+ return CPU_UNKNOWN;
}
}
//Check model name for Loongson3
@@ -149,64 +145,40 @@ char *get_corename(void){
}
void get_architecture(void){
- printf("MIPS64");
+ printf("MIPS");
}
void get_subarchitecture(void){
- if(detect()==CPU_LOONGSON3A) {
- printf("LOONGSON3A");
- }else if(detect()==CPU_LOONGSON3B){
- printf("LOONGSON3B");
+ if(detect()==CPU_P5600){
+ printf("P5600");
}else{
- printf("SICORTEX");
+ printf("UNKNOWN");
}
}
void get_subdirname(void){
- printf("mips64");
+ printf("mips");
}
void get_cpuconfig(void){
- if(detect()==CPU_LOONGSON3A) {
- printf("#define LOONGSON3A\n");
- printf("#define L1_DATA_SIZE 65536\n");
- printf("#define L1_DATA_LINESIZE 32\n");
- printf("#define L2_SIZE 512488\n");
- printf("#define L2_LINESIZE 32\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 4\n");
- }else if(detect()==CPU_LOONGSON3B){
- printf("#define LOONGSON3B\n");
+ if(detect()==CPU_P5600){
+ printf("#define P5600\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
- printf("#define L2_SIZE 512488\n");
+ printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 4\n");
- }else{
- printf("#define SICORTEX\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 32\n");
- printf("#define L2_SIZE 512488\n");
- printf("#define L2_LINESIZE 32\n");
- printf("#define DTB_DEFAULT_ENTRIES 32\n");
- printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
+ }else{
+ printf("#define UNKNOWN\n");
}
}
void get_libname(void){
- if(detect()==CPU_LOONGSON3A) {
- printf("loongson3a\n");
- }else if(detect()==CPU_LOONGSON3B) {
- printf("loongson3b\n");
+ if(detect()==CPU_P5600) {
+ printf("p5600\n");
}else{
-#ifdef __mips64
- printf("mips64\n");
-#else
- printf("mips32\n");
-#endif
+ printf("mips\n");
}
}
diff --git a/cpuid_mips64.c b/cpuid_mips64.c
new file mode 100644
index 000000000..ac1554c79
--- /dev/null
+++ b/cpuid_mips64.c
@@ -0,0 +1,238 @@
+/*****************************************************************************
+Copyright (c) 2011-2014, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define CPU_UNKNOWN 0
+#define CPU_SICORTEX 1
+#define CPU_LOONGSON3A 2
+#define CPU_LOONGSON3B 3
+#define CPU_I6400 4
+#define CPU_P6600 5
+
+static char *cpuname[] = {
+ "UNKOWN",
+ "SICORTEX",
+ "LOONGSON3A",
+ "LOONGSON3B",
+ "I6400",
+ "P6600"
+};
+
+int detect(void){
+
+#ifdef linux
+ FILE *infile;
+ char buffer[512], *p;
+
+ p = (char *)NULL;
+ infile = fopen("/proc/cpuinfo", "r");
+ while (fgets(buffer, sizeof(buffer), infile)){
+ if (!strncmp("cpu", buffer, 3)){
+ p = strchr(buffer, ':') + 2;
+#if 0
+ fprintf(stderr, "%s\n", p);
+#endif
+ break;
+ }
+ }
+
+ fclose(infile);
+
+ if(p != NULL){
+ if (strstr(p, "Loongson-3A")){
+ return CPU_LOONGSON3A;
+ }else if(strstr(p, "Loongson-3B")){
+ return CPU_LOONGSON3B;
+ }else if (strstr(p, "Loongson-3")){
+ infile = fopen("/proc/cpuinfo", "r");
+ p = (char *)NULL;
+ while (fgets(buffer, sizeof(buffer), infile)){
+ if (!strncmp("system type", buffer, 11)){
+ p = strchr(buffer, ':') + 2;
+ break;
+ }
+ }
+ fclose(infile);
+ if (strstr(p, "loongson3a"))
+ return CPU_LOONGSON3A;
+ }else{
+ return CPU_SICORTEX;
+ }
+ }
+ //Check model name for Loongson3
+ infile = fopen("/proc/cpuinfo", "r");
+ p = (char *)NULL;
+ while (fgets(buffer, sizeof(buffer), infile)){
+ if (!strncmp("model name", buffer, 10)){
+ p = strchr(buffer, ':') + 2;
+ break;
+ }
+ }
+ fclose(infile);
+ if(p != NULL){
+ if (strstr(p, "Loongson-3A")){
+ return CPU_LOONGSON3A;
+ }else if(strstr(p, "Loongson-3B")){
+ return CPU_LOONGSON3B;
+ }
+ }
+#endif
+ return CPU_UNKNOWN;
+}
+
+char *get_corename(void){
+ return cpuname[detect()];
+}
+
+void get_architecture(void){
+ printf("MIPS64");
+}
+
+void get_subarchitecture(void){
+ if(detect()==CPU_LOONGSON3A) {
+ printf("LOONGSON3A");
+ }else if(detect()==CPU_LOONGSON3B){
+ printf("LOONGSON3B");
+ }else if(detect()==CPU_I6400){
+ printf("I6400");
+ }else if(detect()==CPU_P6600){
+ printf("P6600");
+ }else{
+ printf("SICORTEX");
+ }
+}
+
+void get_subdirname(void){
+ printf("mips64");
+}
+
+void get_cpuconfig(void){
+ if(detect()==CPU_LOONGSON3A) {
+ printf("#define LOONGSON3A\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 512488\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 4\n");
+ }else if(detect()==CPU_LOONGSON3B){
+ printf("#define LOONGSON3B\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 512488\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 4\n");
+ }else if(detect()==CPU_I6400){
+ printf("#define I6400\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ }else if(detect()==CPU_P6600){
+ printf("#define P6600\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ }else{
+ printf("#define SICORTEX\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 512488\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 32\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ }
+}
+
+void get_libname(void){
+ if(detect()==CPU_LOONGSON3A) {
+ printf("loongson3a\n");
+ }else if(detect()==CPU_LOONGSON3B) {
+ printf("loongson3b\n");
+ }else if(detect()==CPU_I6400) {
+ printf("i6400\n");
+ }else if(detect()==CPU_P6600) {
+ printf("p6600\n");
+ }else{
+ printf("mips64\n");
+ }
+}
diff --git a/cpuid_x86.c b/cpuid_x86.c
index e5938803d..bbd377f67 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1172,6 +1172,8 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
+ case 12:
+ // Braswell
case 13:
// Avoton
return CPUTYPE_NEHALEM;
@@ -1678,6 +1680,8 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
+ case 12:
+ // Braswell
case 13:
// Avoton
return CORE_NEHALEM;
diff --git a/ctest.c b/ctest.c
index b5c74f137..e0ef46e60 100644
--- a/ctest.c
+++ b/ctest.c
@@ -110,7 +110,7 @@ ARCH_MIPS64
#endif
#if defined(__mips32) || defined(__mips)
-ARCH_MIPS32
+ARCH_MIPS
#endif
#ifdef __alpha
diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index dbe785bcb..addcffeac 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
enable_language(Fortran)
diff --git a/ctest/Makefile b/ctest/Makefile
index 7a5d236aa..6eda43863 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1
@@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif
+endif
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2
@@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif
+endif
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
+ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3
@@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
+endif
diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt
index 696767486..f444469bd 100644
--- a/driver/level2/CMakeLists.txt
+++ b/driver/level2/CMakeLists.txt
@@ -1,5 +1,5 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
# sources that need to be compiled twice, once with no flags and once with LOWER
set(UL_SOURCES
diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt
index 3d3303af2..36677a942 100644
--- a/driver/level3/CMakeLists.txt
+++ b/driver/level3/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa
diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt
index b361f2a97..489d40c76 100644
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
if (${CORE} STREQUAL "PPC440")
set(MEMORY memory_qalloc.c)
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 2fde07fcc..18f85c316 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -261,8 +261,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
- //Intel Avoton
- if (model == 13) {
+ //Intel Braswell / Avoton
+ if (model == 12 || model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
@@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
- for ( i=1 ; i <= 21; i++)
+ for ( i=1 ; i <= 22; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
diff --git a/driver/others/init.c b/driver/others/init.c
index f134f85f7..801f93991 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -361,6 +361,9 @@ static void numa_mapping(void) {
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
+ int current_cpu;
+ int current_node = 0;
+ int cpu_count = 0;
for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
@@ -382,33 +385,84 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
- h = 1;
-
- while (h < count) h = 2 * h + 1;
-
- while (h > 1) {
- h /= 2;
- for (i = h; i < count; i++) {
- work = common -> cpu_info[i];
- bit = CPU_ISSET(i, &cpu_orig_mask[0]);
- j = i - h;
- while (work < common -> cpu_info[j]) {
- common -> cpu_info[j + h] = common -> cpu_info[j];
- if (CPU_ISSET(j, &cpu_orig_mask[0])) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
- }
- j -= h;
- if (j < 0) break;
- }
- common -> cpu_info[j + h] = work;
- if (bit) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
+ current_cpu = sched_getcpu();
+ for (cpu = 0; cpu < count; cpu++) {
+ if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
+ current_node = READ_NODE(common -> cpu_info[cpu]);
+ break;
+ }
+ }
+ for (i = 0; i < MAX_BITMASK_LEN; i++)
+ cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
+
+ /*
+ * If all the processes can be accommodated in the
+ * in the current node itself, then bind to cores
+ * from the current node only
+ */
+ if (numprocs <= cpu_count) {
+ /*
+ * First sort all the cores in order from the current node.
+ * Then take remaining nodes one by one in order,
+ * and sort their cores in order.
+ */
+ for (i = 0; i < count; i++) {
+ for (j = 0; j < count - 1; j++) {
+ int node_1, node_2;
+ int core_1, core_2;
+ int swap = 0;
+
+ node_1 = READ_NODE(common -> cpu_info[j]);
+ node_2 = READ_NODE(common -> cpu_info[j + 1]);
+ core_1 = READ_CORE(common -> cpu_info[j]);
+ core_2 = READ_CORE(common -> cpu_info[j + 1]);
+
+ if (node_1 == node_2) {
+ if (core_1 > core_2)
+ swap = 1;
+ } else {
+ if ((node_2 == current_node) ||
+ ((node_1 != current_node) && (node_1 > node_2)))
+ swap = 1;
+ }
+ if (swap) {
+ unsigned long temp;
+
+ temp = common->cpu_info[j];
+ common->cpu_info[j] = common->cpu_info[j + 1];
+ common->cpu_info[j + 1] = temp;
+ }
}
+ }
+ } else {
+ h = 1;
+
+ while (h < count) h = 2 * h + 1;
+
+ while (h > 1) {
+ h /= 2;
+ for (i = h; i < count; i++) {
+ work = common -> cpu_info[i];
+ bit = CPU_ISSET(i, &cpu_orig_mask[0]);
+ j = i - h;
+ while (work < common -> cpu_info[j]) {
+ common -> cpu_info[j + h] = common -> cpu_info[j];
+ if (CPU_ISSET(j, &cpu_orig_mask[0])) {
+ CPU_SET(j + h, &cpu_orig_mask[0]);
+ } else {
+ CPU_CLR(j + h, &cpu_orig_mask[0]);
+ }
+ j -= h;
+ if (j < 0) break;
+ }
+ common -> cpu_info[j + h] = work;
+ if (bit) {
+ CPU_SET(j + h, &cpu_orig_mask[0]);
+ } else {
+ CPU_CLR(j + h, &cpu_orig_mask[0]);
+ }
+ }
}
}
@@ -416,7 +470,10 @@ static void numa_mapping(void) {
fprintf(stderr, "\nSorting ...\n\n");
for (cpu = 0; cpu < count; cpu++)
- fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
+ fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
+ READ_CPU(common -> cpu_info[cpu]),
+ READ_CORE(common -> cpu_info[cpu]),
+ READ_NODE(common -> cpu_info[cpu]));
#endif
}
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index f4b1a80ad..f22c6b69a 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
- defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
+ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
int size = 16;
#else
int size = get_L2_size();
diff --git a/exports/Makefile b/exports/Makefile
index c2b8d9c1c..5632b6fff 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
- $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
- $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
diff --git a/f_check b/f_check
index 4c9d81e9f..2f01f1c44 100644
--- a/f_check
+++ b/f_check
@@ -114,7 +114,7 @@ if ($compiler eq "") {
$openmp = "-mp";
}
- if ($data =~ /IBM/) {
+ if ($data =~ /IBM XL/) {
$vendor = IBM;
$openmp = "-openmp";
}
@@ -223,7 +223,12 @@ if (!$?) {
}
#For gfortran MIPS
if ($?) {
- $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
+ $mips_data = `$compiler_bin -E -dM - < /dev/null`;
+ if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
+ $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
+ } else {
+ $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
+ }
}
$binary = "" if ($?);
}
diff --git a/getarch.c b/getarch.c
index 1e0b08675..f8069e507 100644
--- a/getarch.c
+++ b/getarch.c
@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
+/* #define FORCE_I6400 */
+/* #define FORCE_P6600 */
+/* #define FORCE_P5600 */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */
@@ -699,6 +702,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
+#ifdef FORCE_I6400
+#define FORCE
+#define ARCHITECTURE "MIPS"
+#define SUBARCHITECTURE "I6400"
+#define SUBDIRNAME "mips64"
+#define ARCHCONFIG "-DI6400 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "i6400"
+#define CORENAME "I6400"
+#else
+#endif
+
+#ifdef FORCE_P6600
+#define FORCE
+#define ARCHITECTURE "MIPS"
+#define SUBARCHITECTURE "P6600"
+#define SUBDIRNAME "mips64"
+#define ARCHCONFIG "-DP6600 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "p6600"
+#define CORENAME "P6600"
+#else
+#endif
+
+#ifdef FORCE_P5600
+#define FORCE
+#define ARCHITECTURE "MIPS"
+#define SUBARCHITECTURE "P5600"
+#define SUBDIRNAME "mips"
+#define ARCHCONFIG "-DP5600 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "p5600"
+#define CORENAME "P5600"
+#else
+#endif
+
#ifdef FORCE_ITANIUM2
#define FORCE
#define ARCHITECTURE "IA64"
@@ -888,7 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __mips__
+#ifdef __mips64
+#include "cpuid_mips64.c"
+#else
#include "cpuid_mips.c"
+#endif
#define OPENBLAS_SUPPORTED
#endif
diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index 9ff924e5f..1722dc661 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -1,5 +1,5 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
set(BLAS1_SOURCES
diff --git a/interface/lapack/fortran/dlaqr5.f b/interface/lapack/fortran/dlaqr5.f
new file mode 100644
index 000000000..a8fad0a79
--- /dev/null
+++ b/interface/lapack/fortran/dlaqr5.f
@@ -0,0 +1,1083 @@
+! Copyright (c) 2013-2016, The OpenBLAS Project
+! All rights reserved.
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions are
+! met:
+! 1. Redistributions of source code must retain the above copyright
+! notice, this list of conditions and the following disclaimer.
+! 2. Redistributions in binary form must reproduce the above copyright
+! notice, this list of conditions and the following disclaimer in
+! the documentation and/or other materials provided with the
+! distribution.
+! 3. Neither the name of the OpenBLAS project nor the names of
+! its contributors may be used to endorse or promote products
+! derived from this software without specific prior written permission.
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+! ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+! DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+! USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*> \brief \b DLAQR5 performs a single small-bulge multi-shift QR sweep.
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQR5 + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
+* SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U,
+* LDU, NV, WV, LDWV, NH, WH, LDWH )
+*
+* .. Scalar Arguments ..
+* INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV,
+* $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV
+* LOGICAL WANTT, WANTZ
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ),
+* $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ),
+* $ Z( LDZ, * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DLAQR5, called by DLAQR0, performs a
+*> single small-bulge multi-shift QR sweep.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] WANTT
+*> \verbatim
+*> WANTT is logical scalar
+*> WANTT = .true. if the quasi-triangular Schur factor
+*> is being computed. WANTT is set to .false. otherwise.
+*> \endverbatim
+*>
+*> \param[in] WANTZ
+*> \verbatim
+*> WANTZ is logical scalar
+*> WANTZ = .true. if the orthogonal Schur factor is being
+*> computed. WANTZ is set to .false. otherwise.
+*> \endverbatim
+*>
+*> \param[in] KACC22
+*> \verbatim
+*> KACC22 is integer with value 0, 1, or 2.
+*> Specifies the computation mode of far-from-diagonal
+*> orthogonal updates.
+*> = 0: DLAQR5 does not accumulate reflections and does not
+*> use matrix-matrix multiply to update far-from-diagonal
+*> matrix entries.
+*> = 1: DLAQR5 accumulates reflections and uses matrix-matrix
+*> multiply to update the far-from-diagonal matrix entries.
+*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix
+*> multiply to update the far-from-diagonal matrix entries,
+*> and takes advantage of 2-by-2 block structure during
+*> matrix multiplies.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is integer scalar
+*> N is the order of the Hessenberg matrix H upon which this
+*> subroutine operates.
+*> \endverbatim
+*>
+*> \param[in] KTOP
+*> \verbatim
+*> KTOP is integer scalar
+*> \endverbatim
+*>
+*> \param[in] KBOT
+*> \verbatim
+*> KBOT is integer scalar
+*> These are the first and last rows and columns of an
+*> isolated diagonal block upon which the QR sweep is to be
+*> applied. It is assumed without a check that
+*> either KTOP = 1 or H(KTOP,KTOP-1) = 0
+*> and
+*> either KBOT = N or H(KBOT+1,KBOT) = 0.
+*> \endverbatim
+*>
+*> \param[in] NSHFTS
+*> \verbatim
+*> NSHFTS is integer scalar
+*> NSHFTS gives the number of simultaneous shifts. NSHFTS
+*> must be positive and even.
+*> \endverbatim
+*>
+*> \param[in,out] SR
+*> \verbatim
+*> SR is DOUBLE PRECISION array of size (NSHFTS)
+*> \endverbatim
+*>
+*> \param[in,out] SI
+*> \verbatim
+*> SI is DOUBLE PRECISION array of size (NSHFTS)
+*> SR contains the real parts and SI contains the imaginary
+*> parts of the NSHFTS shifts of origin that define the
+*> multi-shift QR sweep. On output SR and SI may be
+*> reordered.
+*> \endverbatim
+*>
+*> \param[in,out] H
+*> \verbatim
+*> H is DOUBLE PRECISION array of size (LDH,N)
+*> On input H contains a Hessenberg matrix. On output a
+*> multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied
+*> to the isolated diagonal block in rows and columns KTOP
+*> through KBOT.
+*> \endverbatim
+*>
+*> \param[in] LDH
+*> \verbatim
+*> LDH is integer scalar
+*> LDH is the leading dimension of H just as declared in the
+*> calling procedure. LDH.GE.MAX(1,N).
+*> \endverbatim
+*>
+*> \param[in] ILOZ
+*> \verbatim
+*> ILOZ is INTEGER
+*> \endverbatim
+*>
+*> \param[in] IHIZ
+*> \verbatim
+*> IHIZ is INTEGER
+*> Specify the rows of Z to which transformations must be
+*> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+*> \endverbatim
+*>
+*> \param[in,out] Z
+*> \verbatim
+*> Z is DOUBLE PRECISION array of size (LDZ,IHI)
+*> If WANTZ = .TRUE., then the QR Sweep orthogonal
+*> similarity transformation is accumulated into
+*> Z(ILOZ:IHIZ,ILO:IHI) from the right.
+*> If WANTZ = .FALSE., then Z is unreferenced.
+*> \endverbatim
+*>
+*> \param[in] LDZ
+*> \verbatim
+*> LDZ is integer scalar
+*>             LDZ is the leading dimension of Z just as declared in
+*> the calling procedure. LDZ.GE.N.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*> V is DOUBLE PRECISION array of size (LDV,NSHFTS/2)
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is integer scalar
+*> LDV is the leading dimension of V as declared in the
+*> calling procedure. LDV.GE.3.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*> U is DOUBLE PRECISION array of size
+*> (LDU,3*NSHFTS-3)
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*> LDU is integer scalar
+*> LDU is the leading dimension of U just as declared in the
+*> in the calling subroutine. LDU.GE.3*NSHFTS-3.
+*> \endverbatim
+*>
+*> \param[in] NH
+*> \verbatim
+*> NH is integer scalar
+*> NH is the number of columns in array WH available for
+*> workspace. NH.GE.1.
+*> \endverbatim
+*>
+*> \param[out] WH
+*> \verbatim
+*> WH is DOUBLE PRECISION array of size (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*> LDWH is integer scalar
+*> Leading dimension of WH just as declared in the
+*> calling procedure. LDWH.GE.3*NSHFTS-3.
+*> \endverbatim
+*>
+*> \param[in] NV
+*> \verbatim
+*> NV is integer scalar
+*>             NV is the number of rows in WV available for workspace.
+*> NV.GE.1.
+*> \endverbatim
+*>
+*> \param[out] WV
+*> \verbatim
+*> WV is DOUBLE PRECISION array of size
+*> (LDWV,3*NSHFTS-3)
+*> \endverbatim
+*>
+*> \param[in] LDWV
+*> \verbatim
+*> LDWV is integer scalar
+*> LDWV is the leading dimension of WV as declared in the
+*> in the calling subroutine. LDWV.GE.NV.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date September 2012
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*> \par Contributors:
+* ==================
+*>
+*> Karen Braman and Ralph Byers, Department of Mathematics,
+*> University of Kansas, USA
+*
+*> \par References:
+* ================
+*>
+*> K. Braman, R. Byers and R. Mathias, The Multi-Shift QR
+*> Algorithm Part I: Maintaining Well Focused Shifts, and Level 3
+*> Performance, SIAM Journal of Matrix Analysis, volume 23, pages
+*> 929--947, 2002.
+*>
+* =====================================================================
+ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
+ $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U,
+ $ LDU, NV, WV, LDWV, NH, WH, LDWH )
+*
+* -- LAPACK auxiliary routine (version 3.4.2) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* September 2012
+*
+* .. Scalar Arguments ..
+ INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV,
+ $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV
+ LOGICAL WANTT, WANTZ
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ),
+ $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ),
+ $ Z( LDZ, * )
+* ..
+*
+* ================================================================
+* .. Parameters ..
+ DOUBLE PRECISION ZERO, ONE
+ PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 )
+* ..
+* .. Local Scalars ..
+ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM,
+ $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2,
+ $ ULP
+ INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN,
+ $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS,
+ $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL,
+ $ NS, NU
+ LOGICAL ACCUM, BLK22, BMP22
+* ..
+* .. External Functions ..
+ DOUBLE PRECISION DLAMCH
+ EXTERNAL DLAMCH
+* ..
+* .. Intrinsic Functions ..
+*
+ INTRINSIC ABS, DBLE, MAX, MIN, MOD
+* ..
+* .. Local Arrays ..
+ DOUBLE PRECISION VT( 3 )
+* temp scalars
+ DOUBLE PRECISION tempv1, tempv2, tempv3,
+ $ tempv4, tempv5, tempv6,
+ $ temph1, temph2, temph3,
+ $ temph4, temph5, temph6,
+ $ tempz1, tempz2, tempz3,
+ $ tempz4, tempz5, tempz6,
+ $ tempu1, tempu2, tempu3,
+ $ tempu4, tempu5, tempu6,
+ $ REFSU1
+ INTEGER JBEGIN, M1
+* ..
+* .. External Subroutines ..
+ EXTERNAL DGEMM, DLABAD, DLACPY, DLAQR1, DLARFG, DLASET,
+ $ DTRMM
+* ..
+* .. Executable Statements ..
+*
+* ==== If there are no shifts, then there is nothing to do. ====
+*
+ IF( NSHFTS.LT.2 )
+ $ RETURN
+*
+* ==== If the active block is empty or 1-by-1, then there
+* . is nothing to do. ====
+*
+ IF( KTOP.GE.KBOT )
+ $ RETURN
+*
+* ==== Shuffle shifts into pairs of real shifts and pairs
+* . of complex conjugate shifts assuming complex
+* . conjugate shifts are already adjacent to one
+* . another. ====
+*
+ DO 10 I = 1, NSHFTS - 2, 2
+ IF( SI( I ).NE.-SI( I+1 ) ) THEN
+*
+ SWAP = SR( I )
+ SR( I ) = SR( I+1 )
+ SR( I+1 ) = SR( I+2 )
+ SR( I+2 ) = SWAP
+*
+ SWAP = SI( I )
+ SI( I ) = SI( I+1 )
+ SI( I+1 ) = SI( I+2 )
+ SI( I+2 ) = SWAP
+ END IF
+ 10 CONTINUE
+*
+* ==== NSHFTS is supposed to be even, but if it is odd,
+* . then simply reduce it by one. The shuffle above
+* . ensures that the dropped shift is real and that
+* . the remaining shifts are paired. ====
+*
+ NS = NSHFTS - MOD( NSHFTS, 2 )
+*
+* ==== Machine constants for deflation ====
+*
+ SAFMIN = DLAMCH( 'SAFE MINIMUM' )
+ SAFMAX = ONE / SAFMIN
+ CALL DLABAD( SAFMIN, SAFMAX )
+ ULP = DLAMCH( 'PRECISION' )
+ SMLNUM = SAFMIN*( DBLE( N ) / ULP )
+*
+* ==== Use accumulated reflections to update far-from-diagonal
+* . entries ? ====
+*
+ ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 )
+*
+* ==== If so, exploit the 2-by-2 block structure? ====
+*
+ BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 )
+*
+* ==== clear trash ====
+*
+ IF( KTOP+2.LE.KBOT )
+ $ H( KTOP+2, KTOP ) = ZERO
+*
+* ==== NBMPS = number of 2-shift bulges in the chain ====
+*
+ NBMPS = NS / 2
+*
+* ==== KDU = width of slab ====
+*
+ KDU = 6*NBMPS - 3
+*
+* ==== Create and chase chains of NBMPS bulges ====
+*
+ DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2
+ NDCOL = INCOL + KDU
+ IF( ACCUM )
+ $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU )
+*
+* ==== Near-the-diagonal bulge chase. The following loop
+* . performs the near-the-diagonal part of a small bulge
+* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal
+* . chunk extends from column INCOL to column NDCOL
+* . (including both column INCOL and column NDCOL). The
+* . following loop chases a 3*NBMPS column long chain of
+* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL
+*         .    may be less than KTOP and NDCOL may be greater than
+* . KBOT indicating phantom columns from which to chase
+* . bulges before they are actually introduced or to which
+* . to chase bulges beyond column KBOT.) ====
+*
+ DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 )
+*
+* ==== Bulges number MTOP to MBOT are active double implicit
+* . shift bulges. There may or may not also be small
+* . 2-by-2 bulge, if there is room. The inactive bulges
+* . (if any) must wait until the active bulges have moved
+* . down the diagonal to make room. The phantom matrix
+* . paradigm described above helps keep track. ====
+*
+ MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 )
+ MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 )
+ M22 = MBOT + 1
+ BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ.
+ $ ( KBOT-2 )
+*
+* ==== Generate reflections to chase the chain right
+* . one column. (The minimum value of K is KTOP-1.) ====
+*
+ DO 20 M = MTOP, MBOT
+ K = KRCOL + 3*( M-1 )
+ IF( K.EQ.KTOP-1 ) THEN
+ CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ),
+ $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ),
+ $ V( 1, M ) )
+ ALPHA = V( 1, M )
+ CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) )
+ ELSE
+ BETA = H( K+1, K )
+ V( 2, M ) = H( K+2, K )
+ V( 3, M ) = H( K+3, K )
+ CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) )
+*
+* ==== A Bulge may collapse because of vigilant
+* . deflation or destructive underflow. In the
+* . underflow case, try the two-small-subdiagonals
+* . trick to try to reinflate the bulge. ====
+*
+ IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE.
+ $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN
+*
+* ==== Typical case: not collapsed (yet). ====
+*
+ H( K+1, K ) = BETA
+ H( K+2, K ) = ZERO
+ H( K+3, K ) = ZERO
+ ELSE
+*
+* ==== Atypical case: collapsed. Attempt to
+* . reintroduce ignoring H(K+1,K) and H(K+2,K).
+* . If the fill resulting from the new
+* . reflector is too large, then abandon it.
+* . Otherwise, use the new one. ====
+*
+ CALL DLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ),
+ $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ),
+ $ VT )
+ ALPHA = VT( 1 )
+ CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) )
+ REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )*
+ $ H( K+2, K ) )
+*
+ IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+
+ $ ABS( REFSUM*VT( 3 ) ).GT.ULP*
+ $ ( ABS( H( K, K ) )+ABS( H( K+1,
+ $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN
+*
+* ==== Starting a new bulge here would
+* . create non-negligible fill. Use
+* . the old one with trepidation. ====
+*
+ H( K+1, K ) = BETA
+ H( K+2, K ) = ZERO
+ H( K+3, K ) = ZERO
+ ELSE
+*
+*                 ==== Starting a new bulge here would
+* . create only negligible fill.
+* . Replace the old reflector with
+* . the new one. ====
+*
+ H( K+1, K ) = H( K+1, K ) - REFSUM
+ H( K+2, K ) = ZERO
+ H( K+3, K ) = ZERO
+ V( 1, M ) = VT( 1 )
+ V( 2, M ) = VT( 2 )
+ V( 3, M ) = VT( 3 )
+ END IF
+ END IF
+ END IF
+ 20 CONTINUE
+*
+* ==== Generate a 2-by-2 reflection, if needed. ====
+*
+ K = KRCOL + 3*( M22-1 )
+ IF( BMP22 ) THEN
+ IF( K.EQ.KTOP-1 ) THEN
+ CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ),
+ $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ),
+ $ V( 1, M22 ) )
+ BETA = V( 1, M22 )
+ CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) )
+ ELSE
+ BETA = H( K+1, K )
+ V( 2, M22 ) = H( K+2, K )
+ CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) )
+ H( K+1, K ) = BETA
+ H( K+2, K ) = ZERO
+ END IF
+ END IF
+*
+* ==== Multiply H by reflections from the left ====
+*
+ IF( ACCUM ) THEN
+ JBOT = MIN( NDCOL, KBOT )
+ ELSE IF( WANTT ) THEN
+ JBOT = N
+ ELSE
+ JBOT = KBOT
+ END IF
+ DO 40 J = MAX( KTOP, KRCOL ), JBOT
+ MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 )
+
+ DO 30 M = MTOP, MEND
+
+ M1 = M -1
+
+ tempv1 = V( 1, M )
+ K = KRCOL + 2*M1
+ tempv2 = V( 2, M )
+ K = K + M1
+ tempv3 = V( 3, M )
+ temph1 = H( K+1, J )
+ temph2 = H( K+2, J )
+ temph3 = H( K+3, J )
+
+ REFSUM = tempv1*( temph1+tempv2*
+ $ temph2+tempv3*temph3 )
+
+
+ H( K+1, J ) = temph1 - REFSUM
+ H( K+2, J ) = temph2 - REFSUM*tempv2
+ H( K+3, J ) = temph3 - REFSUM*tempv3
+
+ 30 CONTINUE
+
+ 40 CONTINUE
+ IF( BMP22 ) THEN
+ K = KRCOL + 3*( M22-1 )
+ DO 50 J = MAX( K+1, KTOP ), JBOT
+ REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )*
+ $ H( K+2, J ) )
+ H( K+1, J ) = H( K+1, J ) - REFSUM
+ H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 )
+ 50 CONTINUE
+ END IF
+*
+* ==== Multiply H by reflections from the right.
+* . Delay filling in the last row until the
+* . vigilant deflation check is complete. ====
+*
+ IF( ACCUM ) THEN
+ JTOP = MAX( KTOP, INCOL )
+ ELSE IF( WANTT ) THEN
+ JTOP = 1
+ ELSE
+ JTOP = KTOP
+ END IF
+ DO 90 M = MTOP, MBOT
+ IF( V( 1, M ).NE.ZERO ) THEN
+ tempv1 = V( 1, M )
+ tempv2 = V( 2, M )
+ tempv3 = V( 3, M )
+ K = KRCOL + 3*( M-1 )
+ JBEGIN = JTOP
+
+ IF ( MOD( MIN( KBOT, K+3 )-JTOP+1, 2).GT.0 ) THEN
+ J = JBEGIN
+
+ temph1 = H( J, K+1 )
+ temph2 = H( J, K+2 )
+ temph3 = H( J, K+3 )
+ REFSUM = tempv1* ( temph1+tempv2*temph2+
+ $ tempv3*temph3 )
+ H( J, K+1 ) = temph1 - REFSUM
+ H( J, K+2 ) = temph2 - REFSUM*tempv2
+ H( J, K+3 ) = temph3 - REFSUM*tempv3
+
+ JBEGIN = JBEGIN + 1
+
+ END IF
+
+
+ DO 60 J = JBEGIN, MIN( KBOT, K+3 ), 2
+
+ temph1 = H( J, K+1 )
+ temph4 = H( J+1, K+1 )
+ temph2 = H( J, K+2 )
+ temph5 = H( J+1, K+2 )
+ temph3 = H( J, K+3 )
+ temph6 = H( J+1, K+3 )
+
+ REFSUM = tempv1* ( temph1+tempv2*temph2+
+ $ tempv3*temph3 )
+
+ REFSU1 = tempv1* ( temph4+tempv2*temph5+
+ $ tempv3*temph6 )
+
+ H( J, K+1 ) = temph1 - REFSUM
+ H( J+1, K+1 ) = temph4 - REFSU1
+ H( J, K+2 ) = temph2 - REFSUM*tempv2
+ H( J+1, K+2 ) = temph5 - REFSU1*tempv2
+ H( J, K+3 ) = temph3 - REFSUM*tempv3
+ H( J+1, K+3 ) = temph6 - REFSU1*tempv3
+
+ 60 CONTINUE
+*
+ IF( ACCUM ) THEN
+*
+* ==== Accumulate U. (If necessary, update Z later
+*                    .     with an efficient matrix-matrix
+* . multiply.) ====
+*
+ KMS = K - INCOL
+ JBEGIN=MAX( 1, KTOP-INCOL )
+
+ IF ( MOD(KDU-JBEGIN+1,2).GT.0 ) THEN
+ J = JBEGIN
+ tempu1 = U( J, KMS+1 )
+ tempu2 = U( J, KMS+2 )
+ tempu3 = U( J, KMS+3 )
+ REFSUM = tempv1* ( tempu1+tempv2*tempu2+
+ $ tempv3*tempu3 )
+ U( J, KMS+1 ) = tempu1 - REFSUM
+ U( J, KMS+2 ) = tempu2 - REFSUM*tempv2
+ U( J, KMS+3 ) = tempu3 - REFSUM*tempv3
+ JBEGIN = JBEGIN + 1
+
+ END IF
+
+
+ DO 70 J = JBEGIN, KDU , 2
+
+ tempu1 = U( J, KMS+1 )
+ tempu4 = U( J+1, KMS+1 )
+ tempu2 = U( J, KMS+2 )
+ tempu5 = U( J+1, KMS+2 )
+ tempu3 = U( J, KMS+3 )
+ tempu6 = U( J+1, KMS+3 )
+ REFSUM = tempv1* ( tempu1+tempv2*tempu2+
+ $ tempv3*tempu3 )
+
+ REFSU1 = tempv1* ( tempu4+tempv2*tempu5+
+ $ tempv3*tempu6 )
+
+ U( J, KMS+1 ) = tempu1 - REFSUM
+ U( J+1, KMS+1 ) = tempu4 - REFSU1
+ U( J, KMS+2 ) = tempu2 - REFSUM*tempv2
+ U( J+1, KMS+2 ) = tempu5 - REFSU1*tempv2
+ U( J, KMS+3 ) = tempu3 - REFSUM*tempv3
+ U( J+1, KMS+3 ) = tempu6 - REFSU1*tempv3
+
+ 70 CONTINUE
+
+
+ ELSE IF( WANTZ ) THEN
+*
+* ==== U is not accumulated, so update Z
+* . now by multiplying by reflections
+* . from the right. ====
+*
+ JBEGIN = ILOZ
+
+ IF ( MOD(IHIZ-ILOZ+1,2).GT.0 ) THEN
+ J = JBEGIN
+
+ tempz1 = Z( J, K+1 )
+ tempz2 = Z( J, K+2 )
+ tempz3 = Z( J, K+3 )
+ REFSUM = tempv1* ( tempz1+tempv2*tempz2+
+ $ tempv3*tempz3 )
+ Z( J, K+1 ) = tempz1 - REFSUM
+ Z( J, K+2 ) = tempz2 - REFSUM*tempv2
+ Z( J, K+3 ) = tempz3 - REFSUM*tempv3
+
+ JBEGIN = JBEGIN + 1
+
+ END IF
+
+ DO 80 J = JBEGIN, IHIZ, 2
+
+ tempz1 = Z( J, K+1 )
+ tempz4 = Z( J+1, K+1 )
+ tempz2 = Z( J, K+2 )
+ tempz5 = Z( J+1, K+2 )
+ tempz3 = Z( J, K+3 )
+ tempz6 = Z( J+1, K+3 )
+
+ REFSUM = tempv1* ( tempz1+tempv2*tempz2+
+ $ tempv3*tempz3 )
+
+ REFSU1 = tempv1* ( tempz4+tempv2*tempz5+
+ $ tempv3*tempz6 )
+
+ Z( J, K+1 ) = tempz1 - REFSUM
+ Z( J, K+2 ) = tempz2 - REFSUM*tempv2
+ Z( J, K+3 ) = tempz3 - REFSUM*tempv3
+
+
+ Z( J+1, K+1 ) = tempz4 - REFSU1
+ Z( J+1, K+2 ) = tempz5 - REFSU1*tempv2
+ Z( J+1, K+3 ) = tempz6 - REFSU1*tempv3
+
+
+ 80 CONTINUE
+
+ END IF
+ END IF
+ 90 CONTINUE
+*
+* ==== Special case: 2-by-2 reflection (if needed) ====
+*
+ K = KRCOL + 3*( M22-1 )
+ IF( BMP22 ) THEN
+ IF ( V( 1, M22 ).NE.ZERO ) THEN
+ DO 100 J = JTOP, MIN( KBOT, K+3 )
+ REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )*
+ $ H( J, K+2 ) )
+ H( J, K+1 ) = H( J, K+1 ) - REFSUM
+ H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 )
+ 100 CONTINUE
+*
+ IF( ACCUM ) THEN
+ KMS = K - INCOL
+ DO 110 J = MAX( 1, KTOP-INCOL ), KDU
+ REFSUM = V( 1, M22 )*( U( J, KMS+1 )+
+ $ V( 2, M22 )*U( J, KMS+2 ) )
+ U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
+ U( J, KMS+2 ) = U( J, KMS+2 ) -
+ $ REFSUM*V( 2, M22 )
+ 110 CONTINUE
+ ELSE IF( WANTZ ) THEN
+ DO 120 J = ILOZ, IHIZ
+ REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )*
+ $ Z( J, K+2 ) )
+ Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
+ Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 )
+ 120 CONTINUE
+ END IF
+ END IF
+ END IF
+*
+* ==== Vigilant deflation check ====
+*
+ MSTART = MTOP
+ IF( KRCOL+3*( MSTART-1 ).LT.KTOP )
+ $ MSTART = MSTART + 1
+ MEND = MBOT
+ IF( BMP22 )
+ $ MEND = MEND + 1
+ IF( KRCOL.EQ.KBOT-2 )
+ $ MEND = MEND + 1
+ DO 130 M = MSTART, MEND
+ K = MIN( KBOT-1, KRCOL+3*( M-1 ) )
+*
+* ==== The following convergence test requires that
+*              .    the traditional small-compared-to-nearby-diagonals
+* . criterion and the Ahues & Tisseur (LAWN 122, 1997)
+* . criteria both be satisfied. The latter improves
+* . accuracy in some examples. Falling back on an
+* . alternate convergence criterion when TST1 or TST2
+* . is zero (as done here) is traditional but probably
+* . unnecessary. ====
+*
+ IF( H( K+1, K ).NE.ZERO ) THEN
+ TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) )
+ IF( TST1.EQ.ZERO ) THEN
+ IF( K.GE.KTOP+1 )
+ $ TST1 = TST1 + ABS( H( K, K-1 ) )
+ IF( K.GE.KTOP+2 )
+ $ TST1 = TST1 + ABS( H( K, K-2 ) )
+ IF( K.GE.KTOP+3 )
+ $ TST1 = TST1 + ABS( H( K, K-3 ) )
+ IF( K.LE.KBOT-2 )
+ $ TST1 = TST1 + ABS( H( K+2, K+1 ) )
+ IF( K.LE.KBOT-3 )
+ $ TST1 = TST1 + ABS( H( K+3, K+1 ) )
+ IF( K.LE.KBOT-4 )
+ $ TST1 = TST1 + ABS( H( K+4, K+1 ) )
+ END IF
+ IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) )
+ $ THEN
+ H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) )
+ H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) )
+ H11 = MAX( ABS( H( K+1, K+1 ) ),
+ $ ABS( H( K, K )-H( K+1, K+1 ) ) )
+ H22 = MIN( ABS( H( K+1, K+1 ) ),
+ $ ABS( H( K, K )-H( K+1, K+1 ) ) )
+ SCL = H11 + H12
+ TST2 = H22*( H11 / SCL )
+*
+ IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE.
+ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO
+ END IF
+ END IF
+ 130 CONTINUE
+*
+* ==== Fill in the last row of each bulge. ====
+*
+ MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 )
+ DO 140 M = MTOP, MEND
+ K = KRCOL + 3*( M-1 )
+ REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 )
+ H( K+4, K+1 ) = -REFSUM
+ H( K+4, K+2 ) = -REFSUM*V( 2, M )
+ H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M )
+ 140 CONTINUE
+*
+* ==== End of near-the-diagonal bulge chase. ====
+*
+ 150 CONTINUE
+*
+* ==== Use U (if accumulated) to update far-from-diagonal
+* . entries in H. If required, use U to update Z as
+* . well. ====
+*
+ IF( ACCUM ) THEN
+ IF( WANTT ) THEN
+ JTOP = 1
+ JBOT = N
+ ELSE
+ JTOP = KTOP
+ JBOT = KBOT
+ END IF
+ IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR.
+ $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN
+*
+* ==== Updates not exploiting the 2-by-2 block
+* . structure of U. K1 and NU keep track of
+* . the location and size of U in the special
+* . cases of introducing bulges and chasing
+* . bulges off the bottom. In these special
+* . cases and in case the number of shifts
+* . is NS = 2, there is no 2-by-2 block
+* . structure to exploit. ====
+*
+ K1 = MAX( 1, KTOP-INCOL )
+ NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1
+*
+* ==== Horizontal Multiply ====
+*
+ DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH
+ JLEN = MIN( NH, JBOT-JCOL+1 )
+ CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ),
+ $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH,
+ $ LDWH )
+ CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH,
+ $ H( INCOL+K1, JCOL ), LDH )
+ 160 CONTINUE
+*
+* ==== Vertical multiply ====
+*
+ DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV
+ JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW )
+ CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE,
+ $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ),
+ $ LDU, ZERO, WV, LDWV )
+ CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV,
+ $ H( JROW, INCOL+K1 ), LDH )
+ 170 CONTINUE
+*
+* ==== Z multiply (also vertical) ====
+*
+ IF( WANTZ ) THEN
+ DO 180 JROW = ILOZ, IHIZ, NV
+ JLEN = MIN( NV, IHIZ-JROW+1 )
+ CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE,
+ $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ),
+ $ LDU, ZERO, WV, LDWV )
+ CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV,
+ $ Z( JROW, INCOL+K1 ), LDZ )
+ 180 CONTINUE
+ END IF
+ ELSE
+*
+* ==== Updates exploiting U's 2-by-2 block structure.
+* . (I2, I4, J2, J4 are the last rows and columns
+* . of the blocks.) ====
+*
+ I2 = ( KDU+1 ) / 2
+ I4 = KDU
+ J2 = I4 - I2
+ J4 = KDU
+*
+* ==== KZS and KNZ deal with the band of zeros
+* . along the diagonal of one of the triangular
+* . blocks. ====
+*
+ KZS = ( J4-J2 ) - ( NS+1 )
+ KNZ = NS + 1
+*
+* ==== Horizontal multiply ====
+*
+ DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH
+ JLEN = MIN( NH, JBOT-JCOL+1 )
+*
+* ==== Copy bottom of H to top+KZS of scratch ====
+* (The first KZS rows get multiplied by zero.) ====
+*
+ CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ),
+ $ LDH, WH( KZS+1, 1 ), LDWH )
+*
+* ==== Multiply by U21**T ====
+*
+ CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH )
+ CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE,
+ $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ),
+ $ LDWH )
+*
+* ==== Multiply top of H by U11**T ====
+*
+ CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU,
+ $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH )
+*
+* ==== Copy top of H to bottom of WH ====
+*
+ CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH,
+ $ WH( I2+1, 1 ), LDWH )
+*
+* ==== Multiply by U21**T ====
+*
+ CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE,
+ $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH )
+*
+* ==== Multiply by U22 ====
+*
+ CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE,
+ $ U( J2+1, I2+1 ), LDU,
+ $ H( INCOL+1+J2, JCOL ), LDH, ONE,
+ $ WH( I2+1, 1 ), LDWH )
+*
+* ==== Copy it back ====
+*
+ CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH,
+ $ H( INCOL+1, JCOL ), LDH )
+ 190 CONTINUE
+*
+* ==== Vertical multiply ====
+*
+ DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV
+ JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW )
+*
+* ==== Copy right of H to scratch (the first KZS
+* . columns get multiplied by zero) ====
+*
+ CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ),
+ $ LDH, WV( 1, 1+KZS ), LDWV )
+*
+* ==== Multiply by U21 ====
+*
+ CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV )
+ CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE,
+ $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ),
+ $ LDWV )
+*
+* ==== Multiply by U11 ====
+*
+ CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE,
+ $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV,
+ $ LDWV )
+*
+* ==== Copy left of H to right of scratch ====
+*
+ CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH,
+ $ WV( 1, 1+I2 ), LDWV )
+*
+* ==== Multiply by U21 ====
+*
+ CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE,
+ $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV )
+*
+* ==== Multiply by U22 ====
+*
+ CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE,
+ $ H( JROW, INCOL+1+J2 ), LDH,
+ $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ),
+ $ LDWV )
+*
+* ==== Copy it back ====
+*
+ CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV,
+ $ H( JROW, INCOL+1 ), LDH )
+ 200 CONTINUE
+*
+* ==== Multiply Z (also vertical) ====
+*
+ IF( WANTZ ) THEN
+ DO 210 JROW = ILOZ, IHIZ, NV
+ JLEN = MIN( NV, IHIZ-JROW+1 )
+*
+* ==== Copy right of Z to left of scratch (first
+* . KZS columns get multiplied by zero) ====
+*
+ CALL DLACPY( 'ALL', JLEN, KNZ,
+ $ Z( JROW, INCOL+1+J2 ), LDZ,
+ $ WV( 1, 1+KZS ), LDWV )
+*
+* ==== Multiply by U12 ====
+*
+ CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV,
+ $ LDWV )
+ CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE,
+ $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ),
+ $ LDWV )
+*
+* ==== Multiply by U11 ====
+*
+ CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE,
+ $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE,
+ $ WV, LDWV )
+*
+* ==== Copy left of Z to right of scratch ====
+*
+ CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ),
+ $ LDZ, WV( 1, 1+I2 ), LDWV )
+*
+* ==== Multiply by U21 ====
+*
+ CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE,
+ $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ),
+ $ LDWV )
+*
+* ==== Multiply by U22 ====
+*
+ CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE,
+ $ Z( JROW, INCOL+1+J2 ), LDZ,
+ $ U( J2+1, I2+1 ), LDU, ONE,
+ $ WV( 1, 1+I2 ), LDWV )
+*
+* ==== Copy the result back to Z ====
+*
+ CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV,
+ $ Z( JROW, INCOL+1 ), LDZ )
+ 210 CONTINUE
+ END IF
+ END IF
+ END IF
+ 220 CONTINUE
+*
+* ==== End of DLAQR5 ====
+*
+ END
diff --git a/interface/swap.c b/interface/swap.c
index 23b2e4ec8..7d47d600b 100644
--- a/interface/swap.c
+++ b/interface/swap.c
@@ -42,6 +42,10 @@
#include "functable.h"
#endif
+// Disable multi-threading as it does not show any performance
+// benefits. Keep the multi-threading code for the record.
+#undef SMP
+
#ifndef CBLAS
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
diff --git a/interface/ztrmv.c b/interface/ztrmv.c
index 2be915c32..1721afc1c 100644
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@@ -243,6 +243,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
{
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
+ // It seems to be required for some K8 or Barcelona CPU
+ buffer_size += 8;
if(incx != 1)
buffer_size += n * 2;
}
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index fc4c4028b..17c2b1b89 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -1,6 +1,6 @@
-include_directories(${CMAKE_SOURCE_DIR})
-include("${CMAKE_SOURCE_DIR}/cmake/kernel.cmake")
+include_directories(${PROJECT_SOURCE_DIR})
+include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake")
# Makefile
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 8e6827424..e55f153f5 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64)
USE_GEMM3M = 1
endif
-ifeq ($(ARCH), MIPS)
-USE_GEMM3M = 1
-endif
-
ifeq ($(ARCH), arm)
USE_TRMM = 1
endif
diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c
index 91ca76569..4ef49e293 100644
--- a/kernel/arm/scal.c
+++ b/kernel/arm/scal.c
@@ -40,6 +40,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{
BLASLONG i=0,j=0;
+ if ( (n <= 0) || (inc_x <= 0))
+ return(0);
+
+
while(j < n)
{
diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c
index f543edc04..0521aaa0b 100644
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@@ -43,6 +43,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
BLASLONG ip = 0;
FLOAT temp;
+ if ( (n <= 0) || (inc_x <= 0))
+ return(0);
+
+
inc_x2 = 2 * inc_x;
for ( i=0; i ALPHA0_R
+//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R
+//v13 must save pB1_00_I, pB1_01_I
+//v14 must save pB1_02_R, pB1_03_R
+//v15 must save pB1_02_I, pB1_03_I
//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
@@ -171,8 +173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
ld2 {v2.4s, v3.4s}, [pA]
@@ -189,6 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.4s, v1.4s, v8.s[0]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
+
fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -200,6 +206,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+
fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -211,6 +220,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v14.2s, v15.2s}, [pB]
+ add pB, pB, #16
+
fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -222,56 +234,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.s[2]
+ fmls v25.4s, v0.4s, v11.s[0]
#else
- fmul v25.4s, v0.4s, v9.s[2]
+ fmul v25.4s, v0.4s, v11.s[0]
#endif
- OP_ir v25.4s, v1.4s, v8.s[2]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- fmul v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.s[2]
+ fmls v27.4s, v2.4s, v11.s[0]
#else
- fmul v27.4s, v2.4s, v9.s[2]
+ fmul v27.4s, v2.4s, v11.s[0]
#endif
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_ir v27.4s, v3.4s, v10.s[0]
- fmul v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmul v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.s[3]
+ fmls v29.4s, v0.4s, v11.s[1]
#else
- fmul v29.4s, v0.4s, v9.s[3]
+ fmul v29.4s, v0.4s, v11.s[1]
#endif
- OP_ir v29.4s, v1.4s, v8.s[3]
+ OP_ir v29.4s, v1.4s, v10.s[1]
- fmul v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ fmul v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.s[3]
+ fmls v31.4s, v2.4s, v11.s[1]
#else
- fmul v31.4s, v2.4s, v9.s[3]
+ fmul v31.4s, v2.4s, v11.s[1]
#endif
- OP_ir v31.4s, v3.4s, v8.s[3]
-
- ld2 {v12.4s, v13.4s}, [pB]
- add pB, pB, #32
- ld2 {v4.4s, v5.4s}, [pA]
- add pA, pA, #32
- ld2 {v6.4s, v7.4s}, [pA]
- add pA, pA, #32
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro KERNEL8x4_M1
@@ -280,47 +295,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.s[0]
+ ld2 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+
OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
- OP_ri v25.4s, v0.4s, v9.s[2]
- OP_ir v25.4s, v1.4s, v8.s[2]
+ ld2 {v14.2s, v15.2s}, [pB]
+ add pB, pB, #16
- OP_rr v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
- OP_ri v27.4s, v2.4s, v9.s[2]
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_rr v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
+ OP_ri v25.4s, v0.4s, v11.s[0]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- OP_rr v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
- OP_ri v29.4s, v0.4s, v9.s[3]
- OP_ir v29.4s, v1.4s, v8.s[3]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- OP_rr v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
- OP_ri v31.4s, v2.4s, v9.s[3]
- OP_ir v31.4s, v3.4s, v8.s[3]
+ OP_rr v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
+ OP_ri v27.4s, v2.4s, v11.s[0]
+ OP_ir v27.4s, v3.4s, v10.s[0]
- ld2 {v12.4s, v13.4s}, [pB] // For next round
- add pB, pB, #32
- ld2 {v4.4s, v5.4s}, [pA] // For next round
- add pA, pA, #32
- ld2 {v6.4s, v7.4s}, [pA]
- add pA, pA, #32
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ OP_rr v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
+ OP_ri v29.4s, v0.4s, v11.s[1]
+ OP_ir v29.4s, v1.4s, v10.s[1]
+
+ OP_rr v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
+ OP_ri v31.4s, v2.4s, v11.s[1]
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro KERNEL8x4_M2
@@ -329,47 +353,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.s[0]
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.s[0]
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.s[1]
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.s[2]
- OP_ii v24.4s, v5.4s, v13.s[2]
- OP_ri v25.4s, v4.4s, v13.s[2]
- OP_ir v25.4s, v5.4s, v12.s[2]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
- OP_rr v26.4s, v6.4s, v12.s[2]
- OP_ii v26.4s, v7.4s, v13.s[2]
- OP_ri v27.4s, v6.4s, v13.s[2]
- OP_ir v27.4s, v7.4s, v12.s[2]
+ OP_rr v24.4s, v4.4s, v14.s[0]
+ OP_ii v24.4s, v5.4s, v15.s[0]
+ OP_ri v25.4s, v4.4s, v15.s[0]
+ OP_ir v25.4s, v5.4s, v14.s[0]
- OP_rr v28.4s, v4.4s, v12.s[3]
- OP_ii v28.4s, v5.4s, v13.s[3]
- OP_ri v29.4s, v4.4s, v13.s[3]
- OP_ir v29.4s, v5.4s, v12.s[3]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- OP_rr v30.4s, v6.4s, v12.s[3]
- OP_ii v30.4s, v7.4s, v13.s[3]
- OP_ri v31.4s, v6.4s, v13.s[3]
- OP_ir v31.4s, v7.4s, v12.s[3]
+ OP_rr v26.4s, v6.4s, v14.s[0]
+ OP_ii v26.4s, v7.4s, v15.s[0]
+ OP_ri v27.4s, v6.4s, v15.s[0]
+ OP_ir v27.4s, v7.4s, v14.s[0]
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
- ld2 {v0.4s, v1.4s}, [pA]
- add pA, pA, #32
- ld2 {v2.4s, v3.4s}, [pA]
- add pA, pA, #32
+ OP_rr v28.4s, v4.4s, v14.s[1]
+ OP_ii v28.4s, v5.4s, v15.s[1]
+ OP_ri v29.4s, v4.4s, v15.s[1]
+ OP_ir v29.4s, v5.4s, v14.s[1]
+
+ OP_rr v30.4s, v6.4s, v14.s[1]
+ OP_ii v30.4s, v7.4s, v15.s[1]
+ OP_ri v31.4s, v6.4s, v15.s[1]
+ OP_ir v31.4s, v7.4s, v14.s[1]
.endm
.macro KERNEL8x4_E
@@ -388,157 +419,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.s[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.s[2]
- OP_ii v24.4s, v5.4s, v13.s[2]
- OP_ri v25.4s, v4.4s, v13.s[2]
- OP_ir v25.4s, v5.4s, v12.s[2]
-
- OP_rr v26.4s, v6.4s, v12.s[2]
- OP_ii v26.4s, v7.4s, v13.s[2]
- OP_ri v27.4s, v6.4s, v13.s[2]
- OP_ir v27.4s, v7.4s, v12.s[2]
-
- OP_rr v28.4s, v4.4s, v12.s[3]
- OP_ii v28.4s, v5.4s, v13.s[3]
- OP_ri v29.4s, v4.4s, v13.s[3]
- OP_ir v29.4s, v5.4s, v12.s[3]
-
- OP_rr v30.4s, v6.4s, v12.s[3]
- OP_ii v30.4s, v7.4s, v13.s[3]
- OP_ri v31.4s, v6.4s, v13.s[3]
- OP_ir v31.4s, v7.4s, v12.s[3]
-
+ OP_rr v24.4s, v4.4s, v14.s[0]
+ OP_ii v24.4s, v5.4s, v15.s[0]
+ OP_ri v25.4s, v4.4s, v15.s[0]
+ OP_ir v25.4s, v5.4s, v14.s[0]
+
+ OP_rr v26.4s, v6.4s, v14.s[0]
+ OP_ii v26.4s, v7.4s, v15.s[0]
+ OP_ri v27.4s, v6.4s, v15.s[0]
+ OP_ir v27.4s, v7.4s, v14.s[0]
+
+ OP_rr v28.4s, v4.4s, v14.s[1]
+ OP_ii v28.4s, v5.4s, v15.s[1]
+ OP_ri v29.4s, v4.4s, v15.s[1]
+ OP_ir v29.4s, v5.4s, v14.s[1]
+
+ OP_rr v30.4s, v6.4s, v14.s[1]
+ OP_ii v30.4s, v7.4s, v15.s[1]
+ OP_ri v31.4s, v6.4s, v15.s[1]
+ OP_ir v31.4s, v7.4s, v14.s[1]
.endm
.macro KERNEL8x4_SUB
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- ld2 {v2.4s, v3.4s}, [pA]
- add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.s[0]
- OP_ii v18.4s, v3.4s, v9.s[0]
- OP_ri v19.4s, v2.4s, v9.s[0]
- OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
- OP_ri v25.4s, v0.4s, v9.s[2]
- OP_ir v25.4s, v1.4s, v8.s[2]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- OP_rr v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
- OP_ri v27.4s, v2.4s, v9.s[2]
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_rr v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
+ OP_ri v25.4s, v0.4s, v11.s[0]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- OP_rr v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
- OP_ri v29.4s, v0.4s, v9.s[3]
- OP_ir v29.4s, v1.4s, v8.s[3]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ OP_rr v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
+ OP_ri v27.4s, v2.4s, v11.s[0]
+ OP_ir v27.4s, v3.4s, v10.s[0]
- OP_rr v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
- OP_ri v31.4s, v2.4s, v9.s[3]
- OP_ir v31.4s, v3.4s, v8.s[3]
+ OP_rr v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
+ OP_ri v29.4s, v0.4s, v11.s[1]
+ OP_ir v29.4s, v1.4s, v10.s[1]
+ OP_rr v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
+ OP_ri v31.4s, v2.4s, v11.s[1]
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro SAVE8x4
- mov pCRow1, pCRow0
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
- ld2 {v0.4s, v1.4s}, [pCRow1]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ ld2 {v0.4s, v1.4s}, [pCRow0]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
- st2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
+ st2 {v0.4s, v1.4s}, [pCRow0]
- add pCRow2, pCRow1, #32
+ add pCRow0, pCRow0, #32
- ld2 {v2.4s, v3.4s}, [pCRow2]
+ ld2 {v2.4s, v3.4s}, [pCRow0]
fmla v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmla v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
- st2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
+ st2 {v2.4s, v3.4s}, [pCRow0]
- add pCRow1, pCRow1, LDC
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmla v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmla v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
- add pCRow2, pCRow1, #32
+ add pCRow1, pCRow1, #32
- ld2 {v6.4s, v7.4s}, [pCRow2]
+ ld2 {v6.4s, v7.4s}, [pCRow1]
fmla v6.4s, v22.4s, alphaV0_R
fmls v6.4s, v23.4s, alphaV0_I
- fmla v7.4s, v22.4s, alphaV1_I
- fmla v7.4s, v23.4s, alphaV1_R
- st2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v7.4s, v22.4s, alphaV0_I
+ fmla v7.4s, v23.4s, alphaV0_R
+ st2 {v6.4s, v7.4s}, [pCRow1]
- add pCRow1, pCRow1, LDC
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
- ld2 {v0.4s, v1.4s}, [pCRow1]
+ ld2 {v0.4s, v1.4s}, [pCRow2]
fmla v0.4s, v24.4s, alphaV0_R
fmls v0.4s, v25.4s, alphaV0_I
- fmla v1.4s, v24.4s, alphaV1_I
- fmla v1.4s, v25.4s, alphaV1_R
- st2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v1.4s, v24.4s, alphaV0_I
+ fmla v1.4s, v25.4s, alphaV0_R
+ st2 {v0.4s, v1.4s}, [pCRow2]
- add pCRow2, pCRow1, #32
+ add pCRow2, pCRow2, #32
ld2 {v2.4s, v3.4s}, [pCRow2]
fmla v2.4s, v26.4s, alphaV0_R
fmls v2.4s, v27.4s, alphaV0_I
- fmla v3.4s, v26.4s, alphaV1_I
- fmla v3.4s, v27.4s, alphaV1_R
+ fmla v3.4s, v26.4s, alphaV0_I
+ fmla v3.4s, v27.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow2]
- add pCRow1, pCRow1, LDC
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
- ld2 {v4.4s, v5.4s}, [pCRow1]
+ ld2 {v4.4s, v5.4s}, [pCRow3]
fmla v4.4s, v28.4s, alphaV0_R
fmls v4.4s, v29.4s, alphaV0_I
- fmla v5.4s, v28.4s, alphaV1_I
- fmla v5.4s, v29.4s, alphaV1_R
- st2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v5.4s, v28.4s, alphaV0_I
+ fmla v5.4s, v29.4s, alphaV0_R
+ st2 {v4.4s, v5.4s}, [pCRow3]
- add pCRow2, pCRow1, #32
+ add pCRow3, pCRow3, #32
- ld2 {v6.4s, v7.4s}, [pCRow2]
+ ld2 {v6.4s, v7.4s}, [pCRow3]
fmla v6.4s, v30.4s, alphaV0_R
fmls v6.4s, v31.4s, alphaV0_I
- fmla v7.4s, v30.4s, alphaV1_I
- fmla v7.4s, v31.4s, alphaV1_R
- st2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v7.4s, v30.4s, alphaV0_I
+ fmla v7.4s, v31.4s, alphaV0_R
+ st2 {v6.4s, v7.4s}, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -720,13 +768,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -734,8 +785,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmla v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmla v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -743,8 +794,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v24.4s, alphaV0_R
fmls v0.4s, v25.4s, alphaV0_I
- fmla v1.4s, v24.4s, alphaV1_I
- fmla v1.4s, v25.4s, alphaV1_R
+ fmla v1.4s, v24.4s, alphaV0_I
+ fmla v1.4s, v25.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -752,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v28.4s, alphaV0_R
fmls v4.4s, v29.4s, alphaV0_I
- fmla v5.4s, v28.4s, alphaV1_I
- fmla v5.4s, v29.4s, alphaV1_R
+ fmla v5.4s, v28.4s, alphaV0_I
+ fmla v5.4s, v29.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -800,13 +851,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.2s, v1.2s}, [pCRow1]
fmla v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmla v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmla v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -814,8 +868,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2s, v5.2s}, [pCRow1]
fmla v4.2s, v20.2s, alphaV0_R
fmls v4.2s, v21.2s, alphaV0_I
- fmla v5.2s, v20.2s, alphaV1_I
- fmla v5.2s, v21.2s, alphaV1_R
+ fmla v5.2s, v20.2s, alphaV0_I
+ fmla v5.2s, v21.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -823,8 +877,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pCRow1]
fmla v0.2s, v24.2s, alphaV0_R
fmls v0.2s, v25.2s, alphaV0_I
- fmla v1.2s, v24.2s, alphaV1_I
- fmla v1.2s, v25.2s, alphaV1_R
+ fmla v1.2s, v24.2s, alphaV0_I
+ fmla v1.2s, v25.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -832,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2s, v5.2s}, [pCRow1]
fmla v4.2s, v28.2s, alphaV0_R
fmls v4.2s, v29.2s, alphaV0_I
- fmla v5.2s, v28.2s, alphaV1_I
- fmla v5.2s, v29.2s, alphaV1_R
+ fmla v5.2s, v28.2s, alphaV0_I
+ fmla v5.2s, v29.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -880,13 +934,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.s, v1.s}[0], [pCRow1]
fmla s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmla s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmla s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -894,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.s, v5.s}[0], [pCRow1]
fmla s4, s20, alphaV0_R
fmls s4, s21, alphaV0_I
- fmla s5, s20, alphaV1_I
- fmla s5, s21, alphaV1_R
+ fmla s5, s20, alphaV0_I
+ fmla s5, s21, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -903,8 +960,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pCRow1]
fmla s0, s24, alphaV0_R
fmls s0, s25, alphaV0_I
- fmla s1, s24, alphaV1_I
- fmla s1, s25, alphaV1_R
+ fmla s1, s24, alphaV0_I
+ fmla s1, s25, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -912,8 +969,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.s, v5.s}[0], [pCRow1]
fmla s4, s28, alphaV0_R
fmls s4, s29, alphaV0_I
- fmla s5, s28, alphaV1_I
- fmla s5, s29, alphaV1_R
+ fmla s5, s28, alphaV0_I
+ fmla s5, s29, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -962,13 +1019,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow2, pCRow1, #32
@@ -976,8 +1036,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pCRow2]
fmla v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmla v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
+ fmla v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow2]
add pCRow1, pCRow1, LDC
@@ -985,8 +1045,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmla v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmla v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow2, pCRow1, #32
@@ -994,8 +1054,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v6.4s, v7.4s}, [pCRow2]
fmla v6.4s, v22.4s, alphaV0_R
fmls v6.4s, v23.4s, alphaV0_I
- fmla v7.4s, v22.4s, alphaV1_I
- fmla v7.4s, v23.4s, alphaV1_R
+ fmla v7.4s, v22.4s, alphaV0_I
+ fmla v7.4s, v23.4s, alphaV0_R
st2 {v6.4s, v7.4s}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -1028,13 +1088,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1042,8 +1105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmla v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmla v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1076,13 +1139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.2s, v1.2s}, [pCRow1]
fmla v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmla v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmla v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1090,8 +1156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2s, v5.2s}, [pCRow1]
fmla v4.2s, v20.2s, alphaV0_R
fmls v4.2s, v21.2s, alphaV0_I
- fmla v5.2s, v20.2s, alphaV1_I
- fmla v5.2s, v21.2s, alphaV1_R
+ fmla v5.2s, v20.2s, alphaV0_I
+ fmla v5.2s, v21.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -1124,13 +1190,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.s, v1.s}[0], [pCRow1]
fmla s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmla s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmla s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1138,8 +1207,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.s, v5.s}[0], [pCRow1]
fmla s4, s20, alphaV0_R
fmls s4, s21, alphaV0_I
- fmla s5, s20, alphaV1_I
- fmla s5, s21, alphaV1_R
+ fmla s5, s20, alphaV0_I
+ fmla s5, s21, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -1174,13 +1243,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, #32
@@ -1188,8 +1260,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pCRow1]
fmla v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmla v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
+ fmla v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -1216,13 +1288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.4s, v1.4s}, [pCRow1]
fmla v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmla v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmla v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1248,13 +1323,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.2s, v1.2s}, [pCRow1]
fmla v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmla v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmla v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -1281,13 +1359,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
ld2 {v0.s, v1.s}[0], [pCRow1]
fmla s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmla s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmla s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -1313,10 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0_R, s0
- fmov alpha0_I, s1
- fmov alpha1_R, s0
- fmov alpha1_I, s1
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alphaR, s0
+ fmov alphaI, s1
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -1330,8 +1412,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
@@ -1342,44 +1428,69 @@ cgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
+ .align 5
cgemm_kernel_L4_M8_20:
mov pB, origPB
- asr counterL , origK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , origK, #3
+ cmp counterL , #2
blt cgemm_kernel_L4_M8_32
- KERNEL8x4_I // do one in the K
- KERNEL8x4_M2 // do another in the K
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
- .align 5
+ .align 5
cgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
-
+ .align 5
cgemm_kernel_L4_M8_22a:
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
+ .align 5
cgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
KERNEL8x4_I
-
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
@@ -1390,14 +1501,21 @@ cgemm_kernel_L4_M8_40:
cgemm_kernel_L4_M8_44:
- ands counterL , origK, #1
+ ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
+ .align 5
cgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
+ subs counterL, counterL, #1
+ bne cgemm_kernel_L4_M8_46
+
cgemm_kernel_L4_M8_100:
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
SAVE8x4
diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S
index 17aa5a1e8..70eab96fb 100644
--- a/kernel/arm64/copy.S
+++ b/kernel/arm64/copy.S
@@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
- ld1 {v0.2s}, [X], #8
- st1 {v0.2s}, [Y], #8
+ ldr d0, [X], #8
+ str d0, [Y], #8
#else
- ld1 {v0.2d}, [X], #16
- st1 {v0.2d}, [Y], #16
+ ldr q0, [X], #16
+ str q0, [Y], #16
#endif
#endif
.endm
.macro KERNEL_F4
-
#if !defined(COMPLEX)
#if !defined(DOUBLE)
- ld1 {v0.4s}, [X], #16
- st1 {v0.4s}, [Y], #16
+ ldr q0, [X], #16
+ str q0, [Y], #16
#else // DOUBLE
- ld1 {v0.4s}, [X], #16
- ld1 {v1.4s}, [X], #16
- st1 {v0.4s}, [Y], #16
- st1 {v1.4s}, [Y], #16
+ ldr q0, [X], #16
+ str q0, [Y], #16
+ ldr q1, [X], #16
+ str q1, [Y], #16
+
#endif
#else // COMPLEX
#if !defined(DOUBLE)
- ld1 {v0.4s}, [X], #16
- ld1 {v1.4s}, [X], #16
- st1 {v0.4s}, [Y], #16
- st1 {v1.4s}, [Y], #16
+ ldr q0, [X], #16
+ str q0, [Y], #16
+ ldr q1, [X], #16
+ str q1, [Y], #16
#else // DOUBLE
- ld1 {v0.4s}, [X], #16
- ld1 {v1.4s}, [X], #16
- ld1 {v2.4s}, [X], #16
- ld1 {v3.4s}, [X], #16
- st1 {v0.4s}, [Y], #16
- st1 {v1.4s}, [Y], #16
- st1 {v2.4s}, [Y], #16
- st1 {v3.4s}, [Y], #16
+ ldr q0, [X], #16
+ str q0, [Y], #16
+ ldr q1, [X], #16
+ str q1, [Y], #16
+ ldr q2, [X], #16
+ str q2, [Y], #16
+ ldr q3, [X], #16
+ str q3, [Y], #16
#endif
#endif
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
index ce5cb0406..680fb56c3 100644
--- a/kernel/arm64/ctrmm_kernel_8x4.S
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -46,20 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define temp x16
-#define tempOffset x17
-#define tempK x18
+#define pCRow3 x15
+#define pA x16
+#define alphaR w17
+#define alphaI w18
+#define temp x19
+#define tempOffset x20
+#define tempK x21
#define alpha0_R s10
#define alphaV0_R v10.s[0]
#define alpha0_I s11
#define alphaV0_I v11.s[0]
-#define alpha1_R s14
-#define alphaV1_R v14.s[0]
-#define alpha1_I s15
-#define alphaV1_I v15.s[0]
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -124,14 +126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
-//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
-//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
-//v10 must save ALPHA0_R
-//v11 must save ALPHA0_I
-//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
-//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
-//v14 must save ALPHA1_R
-//v15 must save ALPHA1_I
+//v08 must save pB0_00_R, pB0_01_R
+//v09 must save pB0_00_I, pB0_01_I
+//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R
+//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R
+//v13 must save pB1_00_I, pB1_01_I
+//v14 must save pB1_02_R, pB1_03_R
+//v15 must save pB1_02_I, pB1_03_I
//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
@@ -149,6 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
+
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -173,8 +176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
ld2 {v2.4s, v3.4s}, [pA]
@@ -191,6 +195,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.4s, v1.4s, v8.s[0]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
+
fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -202,6 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+
fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -213,6 +223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v14.2s, v15.2s}, [pB]
+ add pB, pB, #16
+
fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -224,56 +237,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.s[2]
+ fmls v25.4s, v0.4s, v11.s[0]
#else
- fmul v25.4s, v0.4s, v9.s[2]
+ fmul v25.4s, v0.4s, v11.s[0]
#endif
- OP_ir v25.4s, v1.4s, v8.s[2]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- fmul v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.s[2]
+ fmls v27.4s, v2.4s, v11.s[0]
#else
- fmul v27.4s, v2.4s, v9.s[2]
+ fmul v27.4s, v2.4s, v11.s[0]
#endif
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_ir v27.4s, v3.4s, v10.s[0]
- fmul v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmul v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.s[3]
+ fmls v29.4s, v0.4s, v11.s[1]
#else
- fmul v29.4s, v0.4s, v9.s[3]
+ fmul v29.4s, v0.4s, v11.s[1]
#endif
- OP_ir v29.4s, v1.4s, v8.s[3]
+ OP_ir v29.4s, v1.4s, v10.s[1]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmul v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
+ fmul v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.s[3]
+ fmls v31.4s, v2.4s, v11.s[1]
#else
- fmul v31.4s, v2.4s, v9.s[3]
+ fmul v31.4s, v2.4s, v11.s[1]
#endif
- OP_ir v31.4s, v3.4s, v8.s[3]
-
- ld2 {v12.4s, v13.4s}, [pB]
- add pB, pB, #32
- ld2 {v4.4s, v5.4s}, [pA]
- add pA, pA, #32
- ld2 {v6.4s, v7.4s}, [pA]
- add pA, pA, #32
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro KERNEL8x4_M1
@@ -282,47 +298,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.s[0]
+ ld2 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+
OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
- OP_ri v25.4s, v0.4s, v9.s[2]
- OP_ir v25.4s, v1.4s, v8.s[2]
+ ld2 {v14.2s, v15.2s}, [pB]
+ add pB, pB, #16
- OP_rr v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
- OP_ri v27.4s, v2.4s, v9.s[2]
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_rr v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
+ OP_ri v25.4s, v0.4s, v11.s[0]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- OP_rr v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
- OP_ri v29.4s, v0.4s, v9.s[3]
- OP_ir v29.4s, v1.4s, v8.s[3]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- OP_rr v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
- OP_ri v31.4s, v2.4s, v9.s[3]
- OP_ir v31.4s, v3.4s, v8.s[3]
+ OP_rr v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
+ OP_ri v27.4s, v2.4s, v11.s[0]
+ OP_ir v27.4s, v3.4s, v10.s[0]
- ld2 {v12.4s, v13.4s}, [pB] // For next round
- add pB, pB, #32
- ld2 {v4.4s, v5.4s}, [pA] // For next round
- add pA, pA, #32
- ld2 {v6.4s, v7.4s}, [pA]
- add pA, pA, #32
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ OP_rr v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
+ OP_ri v29.4s, v0.4s, v11.s[1]
+ OP_ir v29.4s, v1.4s, v10.s[1]
+
+ OP_rr v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
+ OP_ri v31.4s, v2.4s, v11.s[1]
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro KERNEL8x4_M2
@@ -331,47 +356,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.s[0]
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.s[0]
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.s[1]
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.s[2]
- OP_ii v24.4s, v5.4s, v13.s[2]
- OP_ri v25.4s, v4.4s, v13.s[2]
- OP_ir v25.4s, v5.4s, v12.s[2]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
- OP_rr v26.4s, v6.4s, v12.s[2]
- OP_ii v26.4s, v7.4s, v13.s[2]
- OP_ri v27.4s, v6.4s, v13.s[2]
- OP_ir v27.4s, v7.4s, v12.s[2]
+ OP_rr v24.4s, v4.4s, v14.s[0]
+ OP_ii v24.4s, v5.4s, v15.s[0]
+ OP_ri v25.4s, v4.4s, v15.s[0]
+ OP_ir v25.4s, v5.4s, v14.s[0]
- OP_rr v28.4s, v4.4s, v12.s[3]
- OP_ii v28.4s, v5.4s, v13.s[3]
- OP_ri v29.4s, v4.4s, v13.s[3]
- OP_ir v29.4s, v5.4s, v12.s[3]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- OP_rr v30.4s, v6.4s, v12.s[3]
- OP_ii v30.4s, v7.4s, v13.s[3]
- OP_ri v31.4s, v6.4s, v13.s[3]
- OP_ir v31.4s, v7.4s, v12.s[3]
+ OP_rr v26.4s, v6.4s, v14.s[0]
+ OP_ii v26.4s, v7.4s, v15.s[0]
+ OP_ri v27.4s, v6.4s, v15.s[0]
+ OP_ir v27.4s, v7.4s, v14.s[0]
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
- ld2 {v0.4s, v1.4s}, [pA]
- add pA, pA, #32
- ld2 {v2.4s, v3.4s}, [pA]
- add pA, pA, #32
+ OP_rr v28.4s, v4.4s, v14.s[1]
+ OP_ii v28.4s, v5.4s, v15.s[1]
+ OP_ri v29.4s, v4.4s, v15.s[1]
+ OP_ir v29.4s, v5.4s, v14.s[1]
+
+ OP_rr v30.4s, v6.4s, v14.s[1]
+ OP_ii v30.4s, v7.4s, v15.s[1]
+ OP_ri v31.4s, v6.4s, v15.s[1]
+ OP_ir v31.4s, v7.4s, v14.s[1]
.endm
.macro KERNEL8x4_E
@@ -390,157 +422,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.s[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.s[1]
- OP_rr v24.4s, v4.4s, v12.s[2]
- OP_ii v24.4s, v5.4s, v13.s[2]
- OP_ri v25.4s, v4.4s, v13.s[2]
- OP_ir v25.4s, v5.4s, v12.s[2]
-
- OP_rr v26.4s, v6.4s, v12.s[2]
- OP_ii v26.4s, v7.4s, v13.s[2]
- OP_ri v27.4s, v6.4s, v13.s[2]
- OP_ir v27.4s, v7.4s, v12.s[2]
-
- OP_rr v28.4s, v4.4s, v12.s[3]
- OP_ii v28.4s, v5.4s, v13.s[3]
- OP_ri v29.4s, v4.4s, v13.s[3]
- OP_ir v29.4s, v5.4s, v12.s[3]
-
- OP_rr v30.4s, v6.4s, v12.s[3]
- OP_ii v30.4s, v7.4s, v13.s[3]
- OP_ri v31.4s, v6.4s, v13.s[3]
- OP_ir v31.4s, v7.4s, v12.s[3]
-
+ OP_rr v24.4s, v4.4s, v14.s[0]
+ OP_ii v24.4s, v5.4s, v15.s[0]
+ OP_ri v25.4s, v4.4s, v15.s[0]
+ OP_ir v25.4s, v5.4s, v14.s[0]
+
+ OP_rr v26.4s, v6.4s, v14.s[0]
+ OP_ii v26.4s, v7.4s, v15.s[0]
+ OP_ri v27.4s, v6.4s, v15.s[0]
+ OP_ir v27.4s, v7.4s, v14.s[0]
+
+ OP_rr v28.4s, v4.4s, v14.s[1]
+ OP_ii v28.4s, v5.4s, v15.s[1]
+ OP_ri v29.4s, v4.4s, v15.s[1]
+ OP_ir v29.4s, v5.4s, v14.s[1]
+
+ OP_rr v30.4s, v6.4s, v14.s[1]
+ OP_ii v30.4s, v7.4s, v15.s[1]
+ OP_ri v31.4s, v6.4s, v15.s[1]
+ OP_ir v31.4s, v7.4s, v14.s[1]
.endm
.macro KERNEL8x4_SUB
- ld2 {v8.4s, v9.4s}, [pB]
- add pB, pB, #32
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- ld2 {v2.4s, v3.4s}, [pA]
- add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.s[0]
- OP_ii v18.4s, v3.4s, v9.s[0]
- OP_ri v19.4s, v2.4s, v9.s[0]
- OP_ir v19.4s, v3.4s, v8.s[0]
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.s[1]
+ ld2 {v10.2s, v11.2s}, [pB]
+ add pB, pB, #16
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.s[1]
- OP_rr v24.4s, v0.4s, v8.s[2]
- OP_ii v24.4s, v1.4s, v9.s[2]
- OP_ri v25.4s, v0.4s, v9.s[2]
- OP_ir v25.4s, v1.4s, v8.s[2]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- OP_rr v26.4s, v2.4s, v8.s[2]
- OP_ii v26.4s, v3.4s, v9.s[2]
- OP_ri v27.4s, v2.4s, v9.s[2]
- OP_ir v27.4s, v3.4s, v8.s[2]
+ OP_rr v24.4s, v0.4s, v10.s[0]
+ OP_ii v24.4s, v1.4s, v11.s[0]
+ OP_ri v25.4s, v0.4s, v11.s[0]
+ OP_ir v25.4s, v1.4s, v10.s[0]
- OP_rr v28.4s, v0.4s, v8.s[3]
- OP_ii v28.4s, v1.4s, v9.s[3]
- OP_ri v29.4s, v0.4s, v9.s[3]
- OP_ir v29.4s, v1.4s, v8.s[3]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- OP_rr v30.4s, v2.4s, v8.s[3]
- OP_ii v30.4s, v3.4s, v9.s[3]
- OP_ri v31.4s, v2.4s, v9.s[3]
- OP_ir v31.4s, v3.4s, v8.s[3]
+ OP_rr v26.4s, v2.4s, v10.s[0]
+ OP_ii v26.4s, v3.4s, v11.s[0]
+ OP_ri v27.4s, v2.4s, v11.s[0]
+ OP_ir v27.4s, v3.4s, v10.s[0]
+ OP_rr v28.4s, v0.4s, v10.s[1]
+ OP_ii v28.4s, v1.4s, v11.s[1]
+ OP_ri v29.4s, v0.4s, v11.s[1]
+ OP_ir v29.4s, v1.4s, v10.s[1]
+
+ OP_rr v30.4s, v2.4s, v10.s[1]
+ OP_ii v30.4s, v3.4s, v11.s[1]
+ OP_ri v31.4s, v2.4s, v11.s[1]
+ OP_ir v31.4s, v3.4s, v10.s[1]
.endm
.macro SAVE8x4
- mov pCRow1, pCRow0
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
- st2 {v0.4s, v1.4s}, [pCRow1]
-
- add pCRow2, pCRow1, #32
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
+ st2 {v0.4s, v1.4s}, [pCRow0]
+ add pCRow0, pCRow0, #32
fmul v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmul v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
- st2 {v2.4s, v3.4s}, [pCRow2]
-
- add pCRow1, pCRow1, LDC
+ fmul v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
+ st2 {v2.4s, v3.4s}, [pCRow0]
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmul v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmul v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
- add pCRow2, pCRow1, #32
-
+ add pCRow1, pCRow1, #32
fmul v6.4s, v22.4s, alphaV0_R
fmls v6.4s, v23.4s, alphaV0_I
- fmul v7.4s, v22.4s, alphaV1_I
- fmla v7.4s, v23.4s, alphaV1_R
- st2 {v6.4s, v7.4s}, [pCRow2]
-
- add pCRow1, pCRow1, LDC
+ fmul v7.4s, v22.4s, alphaV0_I
+ fmla v7.4s, v23.4s, alphaV0_R
+ st2 {v6.4s, v7.4s}, [pCRow1]
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.4s, v24.4s, alphaV0_R
fmls v0.4s, v25.4s, alphaV0_I
- fmul v1.4s, v24.4s, alphaV1_I
- fmla v1.4s, v25.4s, alphaV1_R
- st2 {v0.4s, v1.4s}, [pCRow1]
-
- add pCRow2, pCRow1, #32
+ fmul v1.4s, v24.4s, alphaV0_I
+ fmla v1.4s, v25.4s, alphaV0_R
+ st2 {v0.4s, v1.4s}, [pCRow2]
+ add pCRow2, pCRow2, #32
fmul v2.4s, v26.4s, alphaV0_R
fmls v2.4s, v27.4s, alphaV0_I
- fmul v3.4s, v26.4s, alphaV1_I
- fmla v3.4s, v27.4s, alphaV1_R
+ fmul v3.4s, v26.4s, alphaV0_I
+ fmla v3.4s, v27.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow2]
- add pCRow1, pCRow1, LDC
-
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.4s, v28.4s, alphaV0_R
fmls v4.4s, v29.4s, alphaV0_I
- fmul v5.4s, v28.4s, alphaV1_I
- fmla v5.4s, v29.4s, alphaV1_R
- st2 {v4.4s, v5.4s}, [pCRow1]
-
- add pCRow2, pCRow1, #32
+ fmul v5.4s, v28.4s, alphaV0_I
+ fmla v5.4s, v29.4s, alphaV0_R
+ st2 {v4.4s, v5.4s}, [pCRow3]
+ add pCRow3, pCRow3, #32
fmul v6.4s, v30.4s, alphaV0_R
fmls v6.4s, v31.4s, alphaV0_I
- fmul v7.4s, v30.4s, alphaV1_I
- fmla v7.4s, v31.4s, alphaV1_R
- st2 {v6.4s, v7.4s}, [pCRow2]
+ fmul v7.4s, v30.4s, alphaV0_I
+ fmla v7.4s, v31.4s, alphaV0_R
+ st2 {v6.4s, v7.4s}, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -722,13 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -736,8 +780,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmul v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmul v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -745,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v0.4s, v24.4s, alphaV0_R
fmls v0.4s, v25.4s, alphaV0_I
- fmul v1.4s, v24.4s, alphaV1_I
- fmla v1.4s, v25.4s, alphaV1_R
+ fmul v1.4s, v24.4s, alphaV0_I
+ fmla v1.4s, v25.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -754,8 +798,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.4s, v28.4s, alphaV0_R
fmls v4.4s, v29.4s, alphaV0_I
- fmul v5.4s, v28.4s, alphaV1_I
- fmla v5.4s, v29.4s, alphaV1_R
+ fmul v5.4s, v28.4s, alphaV0_I
+ fmla v5.4s, v29.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -802,13 +846,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmul v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmul v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -816,8 +863,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.2s, v20.2s, alphaV0_R
fmls v4.2s, v21.2s, alphaV0_I
- fmul v5.2s, v20.2s, alphaV1_I
- fmla v5.2s, v21.2s, alphaV1_R
+ fmul v5.2s, v20.2s, alphaV0_I
+ fmla v5.2s, v21.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -825,8 +872,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v0.2s, v24.2s, alphaV0_R
fmls v0.2s, v25.2s, alphaV0_I
- fmul v1.2s, v24.2s, alphaV1_I
- fmla v1.2s, v25.2s, alphaV1_R
+ fmul v1.2s, v24.2s, alphaV0_I
+ fmla v1.2s, v25.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -834,8 +881,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.2s, v28.2s, alphaV0_R
fmls v4.2s, v29.2s, alphaV0_I
- fmul v5.2s, v28.2s, alphaV1_I
- fmla v5.2s, v29.2s, alphaV1_R
+ fmul v5.2s, v28.2s, alphaV0_I
+ fmla v5.2s, v29.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -882,13 +929,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmul s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmul s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -896,8 +946,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul s4, s20, alphaV0_R
fmls s4, s21, alphaV0_I
- fmul s5, s20, alphaV1_I
- fmla s5, s21, alphaV1_R
+ fmul s5, s20, alphaV0_I
+ fmla s5, s21, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -905,8 +955,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul s0, s24, alphaV0_R
fmls s0, s25, alphaV0_I
- fmul s1, s24, alphaV1_I
- fmla s1, s25, alphaV1_R
+ fmul s1, s24, alphaV0_I
+ fmla s1, s25, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -914,8 +964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul s4, s28, alphaV0_R
fmls s4, s29, alphaV0_I
- fmul s5, s28, alphaV1_I
- fmla s5, s29, alphaV1_R
+ fmul s5, s28, alphaV0_I
+ fmla s5, s29, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -964,13 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow2, pCRow1, #32
@@ -978,8 +1031,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmul v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
+ fmul v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow2]
add pCRow1, pCRow1, LDC
@@ -987,8 +1040,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmul v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmul v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow2, pCRow1, #32
@@ -996,8 +1049,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v6.4s, v22.4s, alphaV0_R
fmls v6.4s, v23.4s, alphaV0_I
- fmul v7.4s, v22.4s, alphaV1_I
- fmla v7.4s, v23.4s, alphaV1_R
+ fmul v7.4s, v22.4s, alphaV0_I
+ fmla v7.4s, v23.4s, alphaV0_R
st2 {v6.4s, v7.4s}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -1030,13 +1083,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1044,8 +1100,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.4s, v20.4s, alphaV0_R
fmls v4.4s, v21.4s, alphaV0_I
- fmul v5.4s, v20.4s, alphaV1_I
- fmla v5.4s, v21.4s, alphaV1_R
+ fmul v5.4s, v20.4s, alphaV0_I
+ fmla v5.4s, v21.4s, alphaV0_R
st2 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1078,13 +1134,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmul v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmul v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1092,8 +1151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v4.2s, v20.2s, alphaV0_R
fmls v4.2s, v21.2s, alphaV0_I
- fmul v5.2s, v20.2s, alphaV1_I
- fmla v5.2s, v21.2s, alphaV1_R
+ fmul v5.2s, v20.2s, alphaV0_I
+ fmla v5.2s, v21.2s, alphaV0_R
st2 {v4.2s, v5.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -1126,13 +1185,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmul s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmul s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -1140,8 +1202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul s4, s20, alphaV0_R
fmls s4, s21, alphaV0_I
- fmul s5, s20, alphaV1_I
- fmla s5, s21, alphaV1_R
+ fmul s5, s20, alphaV0_I
+ fmla s5, s21, alphaV0_R
st2 {v4.s, v5.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -1176,13 +1238,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow1, pCRow1, #32
@@ -1190,8 +1255,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v2.4s, v18.4s, alphaV0_R
fmls v2.4s, v19.4s, alphaV0_I
- fmul v3.4s, v18.4s, alphaV1_I
- fmla v3.4s, v19.4s, alphaV1_R
+ fmul v3.4s, v18.4s, alphaV0_I
+ fmla v3.4s, v19.4s, alphaV0_R
st2 {v2.4s, v3.4s}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -1218,13 +1283,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.4s, v16.4s, alphaV0_R
fmls v0.4s, v17.4s, alphaV0_I
- fmul v1.4s, v16.4s, alphaV1_I
- fmla v1.4s, v17.4s, alphaV1_R
+ fmul v1.4s, v16.4s, alphaV0_I
+ fmla v1.4s, v17.4s, alphaV0_R
st2 {v0.4s, v1.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1250,13 +1318,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul v0.2s, v16.2s, alphaV0_R
fmls v0.2s, v17.2s, alphaV0_I
- fmul v1.2s, v16.2s, alphaV1_I
- fmla v1.2s, v17.2s, alphaV1_R
+ fmul v1.2s, v16.2s, alphaV0_I
+ fmla v1.2s, v17.2s, alphaV0_R
st2 {v0.2s, v1.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -1283,13 +1354,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
+
mov pCRow1, pCRow0
fmul s0, s16, alphaV0_R
fmls s0, s17, alphaV0_I
- fmul s1, s16, alphaV1_I
- fmla s1, s17, alphaV1_R
+ fmul s1, s16, alphaV0_I
+ fmla s1, s17, alphaV0_R
st2 {v0.s, v1.s}[0], [pCRow1]
add pCRow0, pCRow0, #8
@@ -1315,10 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0_R, s0
- fmov alpha0_I, s1
- fmov alpha1_R, s0
- fmov alpha1_I, s1
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alphaR, s0
+ fmov alphaI, s1
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -1335,8 +1410,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
ctrmm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
#if defined(LEFT)
mov tempOffset, offset
@@ -1370,40 +1450,64 @@ ctrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
- asr counterL , tempK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , tempK, #3
+ cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
- KERNEL8x4_I // do one in the K
- KERNEL8x4_M2 // do another in the K
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
- .align 5
+ .align 5
ctrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
-
+ .align 5
ctrmm_kernel_L4_M8_22a:
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
+ .align 5
ctrmm_kernel_L4_M8_32:
tst counterL, #1
ble ctrmm_kernel_L4_M8_40
KERNEL8x4_I
-
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
@@ -1414,13 +1518,17 @@ ctrmm_kernel_L4_M8_40:
ctrmm_kernel_L4_M8_44:
- ands counterL , tempK, #1
+ ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
+ .align 5
ctrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
+ subs counterL, counterL, #1
+ bne ctrmm_kernel_L4_M8_46
+
ctrmm_kernel_L4_M8_100:
SAVE8x4
@@ -1440,6 +1548,9 @@ ctrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
ctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
@@ -1454,9 +1565,8 @@ ctrmm_kernel_L4_M4_BEGIN:
tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN
-ctrmm_kernel_L4_M4_20:
- INIT4x4
+ctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -1475,38 +1585,47 @@ ctrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
- asr counterL , tempK, #3 // counterL = counterL / 8
- cmp counterL , #0
- ble ctrmm_kernel_L4_M4_40
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt ctrmm_kernel_L4_M4_32
-ctrmm_kernel_L4_M4_22:
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
- KERNEL4x4_SUB
- KERNEL4x4_SUB
- KERNEL4x4_SUB
- KERNEL4x4_SUB
+ subs counterL, counterL, #2
+ ble ctrmm_kernel_L4_M4_22a
+ .align 5
- KERNEL4x4_SUB
- KERNEL4x4_SUB
- KERNEL4x4_SUB
- KERNEL4x4_SUB
+
+ctrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
-
+ctrmm_kernel_L4_M4_22a:
+ KERNEL4x4_M1
+ KERNEL4x4_E
+ b ctrmm_kernel_L4_M4_44
+ctrmm_kernel_L4_M4_32:
+ tst counterL, #1
+ ble ctrmm_kernel_L4_M4_40
+ KERNEL4x4_I
+ KERNEL4x4_E
+ b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
- ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M4_100
+ INIT4x4
-ctrmm_kernel_L4_M4_42:
+ctrmm_kernel_L4_M4_44:
+ ands counterL , tempK, #1
+ ble ctrmm_kernel_L4_M4_100
+ctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
- subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M4_42
-
ctrmm_kernel_L4_M4_100:
SAVE4x4
@@ -1528,7 +1647,6 @@ ctrmm_kernel_L4_M4_100:
ctrmm_kernel_L4_M4_END:
-
ctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S
index f3c3d5c35..3fd74fc3b 100644
--- a/kernel/arm64/dgemm_kernel_8x4.S
+++ b/kernel/arm64/dgemm_kernel_8x4.S
@@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
- prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0
@@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
- prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0
@@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
- prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0
@@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
- prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0
@@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x4
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
- add pCRow2, pCRow1, LDC
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #32
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #32
- ld1 {v12.2d, v13.2d}, [pCRow1]
+ ld1 {v12.2d, v13.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV0
- st1 {v12.2d, v13.2d}, [pCRow1]
+ st1 {v12.2d, v13.2d}, [pCRow3]
- add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x4
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
- add pCRow2, pCRow1, LDC
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #16
ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #16
- ld1 {v12.2d}, [pCRow1]
+ ld1 {v12.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
- st1 {v12.2d}, [pCRow1]
+ st1 {v12.2d}, [pCRow3]
- add pCRow0, pCRow0, #16
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #16
.endm
/******************************************************************************/
@@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE1x4
fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
@@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
- add pCRow2, pCRow1, LDC
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #8
ld1 {v12.d}[0], [pCRow2]
- ld1 {v12.d}[1], [pCRow1]
+ ld1 {v12.d}[1], [pCRow3]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
- st1 {v12.d}[1], [pCRow1]
+ st1 {v12.d}[1], [pCRow3]
- add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #8
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #8
.endm
/******************************************************************************/
@@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
@@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE8x2
fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
@@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #64
+
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0
@@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
- add pCRow0, pCRow0, #64
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #64
.endm
/******************************************************************************/
@@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x2
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
- add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #32
.endm
/******************************************************************************/
@@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x2
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
- add pCRow1 , pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
- add pCRow0, pCRow0, #16
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #16
.endm
/******************************************************************************/
@@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE1x2
fmov alpha0, alpha
- add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
@@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #8
.endm
/******************************************************************************/
@@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
fmov alpha0, alpha
+
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
@@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #64
.endm
@@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x1
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #32
.endm
@@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE2x1
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #16
.endm
@@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
.endm
@@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
+ .align 5
dgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20:
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
+ .align 5
dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
@@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40:
dgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
@@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100:
dgemm_kernel_L4_M4_END:
-
dgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
@@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20:
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
+ .align 5
dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
subs counterL, counterL, #1
@@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
@@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20:
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
+ .align 5
dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
subs counterL, counterL, #1
@@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
@@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
- mov pCRow0, pC // pCRow0 = pC
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
- add pC,pC,LDC, lsl #1
+ add pC, pCRow1, LDC
mov pA, origPA // pA = A
@@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
+ .align 5
dgemm_kernel_L2_M8_20:
INIT8x2
@@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
- .align 5
+ .align 5
dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
-
dgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
@@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
- .align 5
+ .align 5
dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
subs counterL, counterL, #1
@@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
@@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20:
dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
@@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20:
dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
@@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
+ .align 5
dgemm_kernel_L1_M8_20:
INIT8x1
@@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
- .align 5
+ .align 5
dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
@@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
- .align 5
+ .align 5
dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
subs counterL, counterL, #1
@@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
@@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
@@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20:
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
+
dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S
index b06c7560d..2b8173715 100644
--- a/kernel/arm64/dtrmm_kernel_8x4.S
+++ b/kernel/arm64/dtrmm_kernel_8x4.S
@@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define temp x16
-#define tempOffset x17
-#define tempK x18
+#define pCRow3 x15
+#define pA x16
+#define alpha x17
+#define temp x18
+#define tempOffset x19
+#define tempK x20
#define alpha0 d10
#define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
-//v08 must save pB0_0, pB0_1
-//v09 must save pB0_2, pB0_3
-//v10 must save ALPHA0
-//v11 must save ALPHA1
-//v12 must save pB1_0, pB1_1
-//v13 must save pB1_2, pB1_3
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v08 must save pB0_0
+//v09 must save pB0_1
+//v10 must save pB0_2 --> ALPHA0
+//v11 must save pB0_3
+//v12 must save pB1_0
+//v13 must save pB1_1
+//v14 must save pB1_2
+//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
@@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v0.2d, v1.2d}, [pA]
- add pA, pA, #32
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
+ ldp q0, q1, [pA], #32
+
+ ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
+ fmul v20.2d, v0.2d, v9.d[0]
+
+ ldp d10, d11, [pB], #16
+
fmul v17.2d, v1.2d, v8.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
+
+ ldp q2, q3, [pA], #32
+
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v28.2d, v0.2d, v11.d[0]
+
+ ldp q4, q5, [pA], #32
+
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
+
+ ldp d12, d13, [pB], #16
+
fmul v18.2d, v2.2d, v8.d[0]
- fmul v19.2d, v3.2d, v8.d[0]
+ fmul v22.2d, v2.2d, v9.d[0]
- fmul v20.2d, v0.2d, v8.d[1]
- fmul v21.2d, v1.2d, v8.d[1]
- fmul v22.2d, v2.2d, v8.d[1]
- fmul v23.2d, v3.2d, v8.d[1]
+ ldp d14, d15, [pB], #16
- fmul v24.2d, v0.2d, v9.d[0]
- fmul v25.2d, v1.2d, v9.d[0]
- fmul v26.2d, v2.2d, v9.d[0]
- fmul v27.2d, v3.2d, v9.d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ fmul v30.2d, v2.2d, v11.d[0]
- fmul v28.2d, v0.2d, v9.d[1]
- fmul v29.2d, v1.2d, v9.d[1]
- fmul v30.2d, v2.2d, v9.d[1]
- fmul v31.2d, v3.2d, v9.d[1]
+ ldp q6, q7, [pA], #32
- ld1 {v4.2d, v5.2d}, [pA]
- add pA, pA, #32
- ld1 {v12.2d, v13.2d}, [pB]
- add pB, pB, #32
- ld1 {v6.2d, v7.2d}, [pA]
- add pA, pA, #32
+ fmul v19.2d, v3.2d, v8.d[0]
+ fmul v27.2d, v3.2d, v10.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmul v31.2d, v3.2d, v11.d[0]
+ fmul v23.2d, v3.2d, v9.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+
+ ldp q4, q5, [pA], #32
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+
+ ldp d12, d13, [pB], #16
+
fmla v17.2d, v1.2d, v8.d[0]
- fmla v18.2d, v2.2d, v8.d[0]
- fmla v19.2d, v3.2d, v8.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
- fmla v20.2d, v0.2d, v8.d[1]
- fmla v21.2d, v1.2d, v8.d[1]
- fmla v22.2d, v2.2d, v8.d[1]
- fmla v23.2d, v3.2d, v8.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmla v24.2d, v0.2d, v9.d[0]
- fmla v25.2d, v1.2d, v9.d[0]
- fmla v26.2d, v2.2d, v9.d[0]
- fmla v27.2d, v3.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
- fmla v28.2d, v0.2d, v9.d[1]
- fmla v29.2d, v1.2d, v9.d[1]
- fmla v30.2d, v2.2d, v9.d[1]
- fmla v31.2d, v3.2d, v9.d[1]
+ ldp d14, d15, [pB], #16
- ld1 {v4.2d, v5.2d}, [pA]
- add pA, pA, #32
- ld1 {v12.2d, v13.2d}, [pB]
- add pB, pB, #32
- ld1 {v6.2d, v7.2d}, [pA]
- add pA, pA, #32
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
+
+ ldp q6, q7, [pA], #32
- prfm PLDL1KEEP, [pA, #512]
+ fmla v27.2d, v3.2d, v10.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+
+ ldp q0, q1, [pA], #32
+
fmla v17.2d, v5.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+
+ ldp d8, d9, [pB], #16
+
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+
+ ldp d10, d11, [pB], #16
+
fmla v18.2d, v6.2d, v12.d[0]
- fmla v19.2d, v7.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
- fmla v20.2d, v4.2d, v12.d[1]
- fmla v21.2d, v5.2d, v12.d[1]
- fmla v22.2d, v6.2d, v12.d[1]
- fmla v23.2d, v7.2d, v12.d[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v24.2d, v4.2d, v13.d[0]
- fmla v25.2d, v5.2d, v13.d[0]
- fmla v26.2d, v6.2d, v13.d[0]
- fmla v27.2d, v7.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
- fmla v28.2d, v4.2d, v13.d[1]
- fmla v29.2d, v5.2d, v13.d[1]
- fmla v30.2d, v6.2d, v13.d[1]
- fmla v31.2d, v7.2d, v13.d[1]
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
- ld1 {v0.2d, v1.2d}, [pA]
- add pA, pA, #32
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
+ ldp q2, q3, [pA], #32
- prfm PLDL1KEEP, [pB, #512]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+
fmla v17.2d, v5.2d, v12.d[0]
- fmla v18.2d, v6.2d, v12.d[0]
- fmla v19.2d, v7.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
- fmla v20.2d, v4.2d, v12.d[1]
- fmla v21.2d, v5.2d, v12.d[1]
- fmla v22.2d, v6.2d, v12.d[1]
- fmla v23.2d, v7.2d, v12.d[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v24.2d, v4.2d, v13.d[0]
- fmla v25.2d, v5.2d, v13.d[0]
- fmla v26.2d, v6.2d, v13.d[0]
- fmla v27.2d, v7.2d, v13.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
- fmla v28.2d, v4.2d, v13.d[1]
- fmla v29.2d, v5.2d, v13.d[1]
- fmla v30.2d, v6.2d, v13.d[1]
- fmla v31.2d, v7.2d, v13.d[1]
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v0.2d, v1.2d}, [pA]
- add pA, pA, #32
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
+ ldp q0, q1, [pA], #32
+
+ ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+
+ ldp d10, d11, [pB], #16
+
fmla v17.2d, v1.2d, v8.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+
+ ldp q2, q3, [pA], #32
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmla v18.2d, v2.2d, v8.d[0]
- fmla v19.2d, v3.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
- fmla v20.2d, v0.2d, v8.d[1]
- fmla v21.2d, v1.2d, v8.d[1]
- fmla v22.2d, v2.2d, v8.d[1]
- fmla v23.2d, v3.2d, v8.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmla v24.2d, v0.2d, v9.d[0]
- fmla v25.2d, v1.2d, v9.d[0]
- fmla v26.2d, v2.2d, v9.d[0]
- fmla v27.2d, v3.2d, v9.d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
- fmla v28.2d, v0.2d, v9.d[1]
- fmla v29.2d, v1.2d, v9.d[1]
- fmla v30.2d, v2.2d, v9.d[1]
- fmla v31.2d, v3.2d, v9.d[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
+
+ fmla v31.2d, v3.2d, v11.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
+
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
- fmul v1.2d, v17.2d, alphaV1
- fmul v2.2d, v18.2d, alphaV2
- fmul v3.2d, v19.2d, alphaV3
- st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ fmul v1.2d, v17.2d, alphaV0
+ stp q0, q1, [pCRow0]
- add pCRow2, pCRow1, LDC
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ fmul v2.2d, v18.2d, alphaV0
+ fmul v3.2d, v19.2d, alphaV0
+ stp q2, q3, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
- fmul v5.2d, v21.2d, alphaV1
- fmul v6.2d, v22.2d, alphaV2
- fmul v7.2d, v23.2d, alphaV3
- st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+ fmul v5.2d, v21.2d, alphaV0
+ stp q4, q5, [pCRow1]
- add pCRow1, pCRow2, LDC
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+ fmul v6.2d, v22.2d, alphaV0
+ fmul v7.2d, v23.2d, alphaV0
+ stp q6, q7, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
- fmul v1.2d, v25.2d, alphaV1
- fmul v2.2d, v26.2d, alphaV2
- fmul v3.2d, v27.2d, alphaV3
- st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
+ fmul v1.2d, v25.2d, alphaV0
+ stp q0, q1, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+ fmul v2.2d, v26.2d, alphaV0
+ fmul v3.2d, v27.2d, alphaV0
+ stp q2, q3, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
- fmul v5.2d, v29.2d, alphaV1
- fmul v6.2d, v30.2d, alphaV2
- fmul v7.2d, v31.2d, alphaV3
- st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+ fmul v5.2d, v29.2d, alphaV0
+ stp q4, q5, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+ fmul v6.2d, v30.2d, alphaV0
+ fmul v7.2d, v31.2d, alphaV0
+ stp q6, q7, [pCRow3]
+
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
- fmul v9.2d, v17.2d, alphaV1
+ fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
- fmul v12.2d, v20.2d, alphaV2
- fmul v13.2d, v21.2d, alphaV3
+ fmul v12.2d, v20.2d, alphaV0
+ fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
- fmul v9.2d, v25.2d, alphaV1
+ fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
- fmul v12.2d, v28.2d, alphaV2
- fmul v13.2d, v29.2d, alphaV3
+ fmul v12.2d, v28.2d, alphaV0
+ fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
- fmul v12.2d, v20.2d, alphaV1
+ fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
- fmul v8.2d, v24.2d, alphaV2
+ fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
- fmul v12.2d, v28.2d, alphaV3
+ fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
- fmul v12.2d, v20.2d, alphaV1
+ fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
+ fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
- fmul v1.2d, v17.2d, alphaV1
- fmul v2.2d, v18.2d, alphaV2
- fmul v3.2d, v19.2d, alphaV3
+ fmul v1.2d, v17.2d, alphaV0
+ fmul v2.2d, v18.2d, alphaV0
+ fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
- fmul v5.2d, v21.2d, alphaV1
- fmul v6.2d, v22.2d, alphaV2
- fmul v7.2d, v23.2d, alphaV3
+ fmul v5.2d, v21.2d, alphaV0
+ fmul v6.2d, v22.2d, alphaV0
+ fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
- fmul v9.2d, v17.2d, alphaV1
+ fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
- fmul v12.2d, v20.2d, alphaV2
- fmul v13.2d, v21.2d, alphaV3
+ fmul v12.2d, v20.2d, alphaV0
+ fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
- fmul v12.2d, v20.2d, alphaV1
+ fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
+ fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
- fmul v1.2d, v17.2d, alphaV1
- fmul v2.2d, v18.2d, alphaV2
- fmul v3.2d, v19.2d, alphaV3
+ fmul v1.2d, v17.2d, alphaV0
+ fmul v2.2d, v18.2d, alphaV0
+ fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
- fmul v9.2d, v17.2d, alphaV1
+ fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
@@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0, d0
- fmov alpha1, d0
- fmov alpha2, d0
- fmov alpha3, d0
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
#if defined(LEFT)
mov tempOffset, offset
@@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
+ .align 5
dtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
- asr counterL , tempK, #1 // L = K / 2
+ asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
- .align 5
+ .align 5
dtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
-
+ .align 5
dtrmm_kernel_L4_M8_22a:
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
+ .align 5
dtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
KERNEL8x4_I
-
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
@@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
dtrmm_kernel_L4_M8_44:
- ands counterL , tempK, #1
+ ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
+ .align 5
dtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
+ subs counterL, counterL, #1
+ bne dtrmm_kernel_L4_M8_46
+
dtrmm_kernel_L4_M8_100:
SAVE8x4
@@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S
index 6279c2250..162f721c3 100644
--- a/kernel/arm64/gemv_n.S
+++ b/kernel/arm64/gemv_n.S
@@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHZ 3
#endif
+#define A_PRE_SIZE 768
+#define Y_PRE_SIZE 768
+
/******************************************************************************/
.macro SAVE_REGS
@@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
+ prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
+ prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
+ prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
+ prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif
diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S
index 0145af621..28325f784 100644
--- a/kernel/arm64/gemv_t.S
+++ b/kernel/arm64/gemv_t.S
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
+#define X_PREFETCH_SIZE 768
+#define A_PREFETCH_SIZE 768
+
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v6.4s, v10.4s
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v7.4s, v11.4s
+ ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
fmla v4.4s, v8.4s, v12.4s
- ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v14.4s, v18.4s
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
+ prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
+ prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif
diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S
index 575c15e53..6c0d84f98 100644
--- a/kernel/arm64/iamax.S
+++ b/kernel/arm64/iamax.S
@@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fabs MAXF, MAXF
.endm
+.macro KERNEL_F8
+#if !defined(DOUBLE)
+ ldp q2, q3, [X], #32
+ fabs v2.4s, v2.4s
+ fabs v3.4s, v3.4s
+ fmax v2.4s, v2.4s, v3.4s
+ fmaxv TMPF, v2.4s
+ fcmp MAXF, TMPF
+ fcsel MAXF, MAXF, TMPF, COND
+ csel INDEX, INDEX, Z, COND
+ add Z, Z, #8
+#else
+ ldp q2, q3, [X], #32
+ ldp q4, q5, [X], #32
+ fabs v2.2d, v2.2d
+ fabs v3.2d, v3.2d
+ fabs v4.2d, v4.2d
+ fabs v5.2d, v5.2d
+
+ fmax v2.2d, v2.2d, v3.2d
+ fmax v4.2d, v4.2d, v5.2d
+ fmax v2.2d, v2.2d, v4.2d
+ fmaxp TMPF, v2.2d
+
+ fcmp MAXF, TMPF
+ fcsel MAXF, MAXF, TMPF, COND
+ csel INDEX, INDEX, Z, COND
+ add Z, Z, #8
+#endif
+ PRFM PLDL1KEEP, [X, #1024]
+.endm
+
+.macro KERNEL_F8_FINALIZE
+ sub x6, INDEX, #1
+#if !defined(DOUBLE)
+ lsl x6, x6, #2
+ add x7, x7, x6
+ ldp q2, q3, [x7]
+ fabs v2.4s, v2.4s
+ fabs v3.4s, v3.4s
+
+ ins v4.s[0], v3.s[0]
+ ins v5.s[0], v3.s[1]
+ ins v6.s[0], v3.s[2]
+ ins v7.s[0], v3.s[3]
+
+ add x6, INDEX, #7
+ fcmp MAXF, s7
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s6
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s5
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v2.s[0]
+ ins v5.s[0], v2.s[1]
+ ins v6.s[0], v2.s[2]
+ ins v7.s[0], v2.s[3]
+
+ sub x6, x6, #1
+ fcmp MAXF, s7
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s6
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s5
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+#else
+ add x6, x6, #4
+ lsl x6, x6, #3
+ add x7, x7, x6
+ ldp q2, q3, [x7]
+
+ fabs v2.2d, v2.2d
+ fabs v3.2d, v3.2d
+
+ ins v4.d[0], v2.d[0]
+ ins v5.d[0], v2.d[1]
+ ins v6.d[0], v3.d[0]
+ ins v7.d[0], v3.d[1]
+
+ add x6, INDEX, #7
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d6
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d5
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d4
+ csel INDEX, x6, INDEX, eq
+
+ sub x7, x7, #32
+ ldp q2, q3, [x7]
+
+ fabs v2.2d, v2.2d
+ fabs v3.2d, v3.2d
+
+ ins v4.d[0], v2.d[0]
+ ins v5.d[0], v2.d[1]
+ ins v6.d[0], v3.d[0]
+ ins v7.d[0], v3.d[1]
+
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d6
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d5
+ csel INDEX, x6, INDEX, eq
+
+ sub x6, x6, #1
+ fcmp MAXF, d4
+ csel INDEX, x6, INDEX, eq
+#endif
+.endm
+
+
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
@@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
+ cmp INC_X, #1
+ bne iamax_kernel_S_BEGIN
+ mov x7, X
+
+iamax_kernel_F_BEGIN:
+
+ INIT_S
+
+ subs N, N, #1
+ ble iamax_kernel_L999
+
+ asr I, N, #3
+ cmp I, xzr
+ beq iamax_kernel_F1
+
+ add Z, Z, #1
+iamax_kernel_F8:
+
+ KERNEL_F8
+
+ subs I, I, #1
+ bne iamax_kernel_F8
+
+ KERNEL_F8_FINALIZE
+
+ sub Z, Z, #1
+iamax_kernel_F1:
+
+ ands I, N, #7
+ ble iamax_kernel_L999
+
+iamax_kernel_F10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne iamax_kernel_F10
+
+ b iamax_kernel_L999
+
+iamax_kernel_S_BEGIN:
+
INIT_S
subs N, N, #1
diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S
index ebdc671e0..9b252ec98 100644
--- a/kernel/arm64/izamax.S
+++ b/kernel/arm64/izamax.S
@@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
.endm
+.macro KERNEL_F8
+#if !defined(DOUBLE)
+ ldp q2, q3, [X], #32
+ ldp q4, q5, [X], #32
+
+ fabs v2.4s, v2.4s
+ fabs v3.4s, v3.4s
+ fabs v4.4s, v4.4s
+ fabs v5.4s, v5.4s
+
+ faddp v2.4s, v2.4s, v3.4s
+ faddp v3.4s, v4.4s, v5.4s
+
+ fmax v2.4s, v2.4s, v3.4s
+ fmaxv TMPF, v2.4s
+ fcmp MAXF, TMPF
+ fcsel MAXF, MAXF, TMPF, COND
+ csel INDEX, INDEX, Z, COND
+ add Z, Z, #8
+#else
+ ldp q2, q3, [X], #32
+ ldp q4, q5, [X], #32
+ ldp q16, q17, [X], #32
+ ldp q18, q19, [X], #32
+
+ fabs v2.2d, v2.2d
+ fabs v3.2d, v3.2d
+ fabs v4.2d, v4.2d
+ fabs v5.2d, v5.2d
+ fabs v16.2d, v16.2d
+ fabs v17.2d, v17.2d
+ fabs v18.2d, v18.2d
+ fabs v19.2d, v19.2d
+
+ faddp v2.2d, v2.2d, v3.2d
+ faddp v3.2d, v4.2d, v5.2d
+ faddp v4.2d, v16.2d, v17.2d
+ faddp v5.2d, v18.2d, v19.2d
+
+ fmax v2.2d, v2.2d, v3.2d
+ fmax v4.2d, v4.2d, v5.2d
+ fmax v2.2d, v2.2d, v4.2d
+ fmaxp TMPF, v2.2d
+
+ fcmp MAXF, TMPF
+ fcsel MAXF, MAXF, TMPF, COND
+ csel INDEX, INDEX, Z, COND
+ add Z, Z, #8
+#endif
+ PRFM PLDL1KEEP, [X, #1024]
+.endm
+
+.macro KERNEL_F8_FINALIZE
+ sub x6, INDEX, #1
+#if !defined(DOUBLE)
+ lsl x6, x6, #3
+ add x7, x7, x6
+
+ ldp q2, q3, [x7]
+ ldp q4, q5, [x7, #32]
+
+ fabs v2.4s, v2.4s
+ fabs v3.4s, v3.4s
+ fabs v4.4s, v4.4s
+ fabs v5.4s, v5.4s
+
+ faddp v2.4s, v2.4s, v3.4s
+ faddp v3.4s, v4.4s, v5.4s
+
+ ins v4.s[0], v3.s[3]
+ add x6, INDEX, #7
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v3.s[2]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v3.s[1]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v3.s[0]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v2.s[3]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v2.s[2]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v2.s[1]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+
+ ins v4.s[0], v2.s[0]
+ sub x6, x6, #1
+ fcmp MAXF, s4
+ csel INDEX, x6, INDEX, eq
+#else
+ lsl x6, x6, #4
+ add x7, x7, x6
+
+ ldp q2, q3, [x7]
+ ldp q4, q5, [x7, #32]
+ ldp q16, q17, [x7, #64]
+ ldp q18, q19, [x7, #96]
+
+ fabs v2.2d, v2.2d
+ fabs v3.2d, v3.2d
+ fabs v4.2d, v4.2d
+ fabs v5.2d, v5.2d
+ fabs v16.2d, v16.2d
+ fabs v17.2d, v17.2d
+ fabs v18.2d, v18.2d
+ fabs v19.2d, v19.2d
+
+ faddp v2.2d, v2.2d, v3.2d
+ faddp v3.2d, v4.2d, v5.2d
+ faddp v4.2d, v16.2d, v17.2d
+ faddp v5.2d, v18.2d, v19.2d
+
+ ins v7.d[0], v5.d[1]
+ add x6, INDEX, #7
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v5.d[0]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v4.d[1]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v4.d[0]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v3.d[1]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v3.d[0]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v2.d[1]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+
+ ins v7.d[0], v2.d[0]
+ sub x6, x6, #1
+ fcmp MAXF, d7
+ csel INDEX, x6, INDEX, eq
+#endif
+.endm
+
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
@@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
+ cmp INC_X, #1
+ bne iamax_kernel_S_BEGIN
+ mov x7, X
+
+
+iamax_kernel_F_BEGIN:
+
+ INIT_S
+
+ subs N, N, #1
+ ble iamax_kernel_L999
+
+ asr I, N, #3
+ cmp I, xzr
+ ble iamax_kernel_F1
+
+ add Z, Z, #1
+
+iamax_kernel_F8:
+
+ KERNEL_F8
+
+ subs I, I, #1
+ bne iamax_kernel_F8
+
+ KERNEL_F8_FINALIZE
+
+ sub Z, Z, #1
+iamax_kernel_F1:
+
+ ands I, N, #7
+ ble iamax_kernel_L999
+
+iamax_kernel_F10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne iamax_kernel_F10
+
+ b iamax_kernel_L999
+
+iamax_kernel_S_BEGIN:
+
INIT_S
subs N, N, #1
diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S
index 68366d9f2..6e3645b76 100644
--- a/kernel/arm64/sgemm_kernel_16x4.S
+++ b/kernel/arm64/sgemm_kernel_16x4.S
@@ -46,16 +46,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
+#define pCRow3 x15
+#define pA x16
+#define alpha w17
#define alpha0 s10
#define alphaV0 v10.s[0]
-#define alpha1 s11
-#define alphaV1 v11.s[0]
-#define alpha2 s14
-#define alphaV2 v14.s[0]
-#define alpha3 s15
-#define alphaV3 v15.s[0]
+
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 224
+#define C_PRE_SIZE 160
+
// 00 origM
// 01 origN
@@ -98,14 +99,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_04, pA1_05, pA1_06, pA1_07
//v06 pA1_08, pA1_09, pA1_10, pA1_11
//v07 pA1_12, pA1_13, pA1_14, pA1_15
-//v08 must save pB00, pB01
-//v09 must save pB02, pB03
-//v10 must save ALPHA0
-//v11 must save ALPHA1
-//v12 must save pB10, pB11
-//v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v08 must save pB00
+//v09 must save pB01
+//v10 must save pB02
+//v11 must save pB03
+//v12 must save pB10
+//v13 must save pB11
+//v14 must save pB12
+//v15 must save pB13
//v16 must save C00, C01, C02, C03
//v17 must save C04, C05, C06, C07
//v18 C08, C09, C10, C11
@@ -147,206 +148,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp q0, q1, [pA], #32
+
+ ldp s8, s9, [pB], #8
fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v9.s[0]
+
+ ldp s10, s11, [pB], #8
+
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
+
+ ldp q2, q3, [pA], #32
+
fmul v17.4s, v1.4s, v8.s[0]
+ fmul v21.4s, v1.4s, v9.s[0]
+
+ ldp q4, q5, [pA], #32
+
+ fmul v25.4s, v1.4s, v10.s[0]
+ fmul v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+
fmul v18.4s, v2.4s, v8.s[0]
+ fmul v22.4s, v2.4s, v9.s[0]
+
+ ldp s14, s15, [pB], #8
+
fmul v19.4s, v3.4s, v8.s[0]
+ fmul v23.4s, v3.4s, v9.s[0]
- fmul v20.4s, v0.4s, v8.s[1]
- fmul v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.s[1]
- fmul v23.4s, v3.4s, v8.s[1]
+ ldp q6, q7, [pA], #32
- fmul v24.4s, v0.4s, v9.s[0]
- fmul v25.4s, v1.4s, v9.s[0]
- fmul v26.4s, v2.4s, v9.s[0]
- fmul v27.4s, v3.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v10.s[0]
+ fmul v30.4s, v2.4s, v11.s[0]
- fmul v28.4s, v0.4s, v9.s[1]
- fmul v29.4s, v1.4s, v9.s[1]
- fmul v30.4s, v2.4s, v9.s[1]
- fmul v31.4s, v3.4s, v9.s[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
- ld1 {v6.4s}, [pA]
- add pA, pA, #16
- ld1 {v7.4s}, [pA]
- add pA, pA, #16
+ fmul v27.4s, v3.4s, v10.s[0]
+ fmul v31.4s, v3.4s, v11.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
+
+ ldp q4, q5, [pA], #32
+
fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v22.4s, v2.4s, v8.s[1]
- fmla v23.4s, v3.4s, v8.s[1]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v26.4s, v2.4s, v9.s[0]
- fmla v27.4s, v3.4s, v9.s[0]
+ ldp s12, s13, [pB], #8
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
- fmla v30.4s, v2.4s, v9.s[1]
- fmla v31.4s, v3.4s, v9.s[1]
+ fmla v22.4s, v2.4s, v9.s[0]
+ fmla v23.4s, v3.4s, v9.s[0]
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
- ld1 {v6.4s}, [pA]
- add pA, pA, #16
- ld1 {v7.4s}, [pA]
- add pA, pA, #16
+ ldp s14, s15, [pB], #8
+
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ fmla v26.4s, v2.4s, v10.s[0]
+ fmla v27.4s, v3.4s, v10.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
+ ldp q6, q7, [pA], #32
+
+ fmla v30.4s, v2.4s, v11.s[0]
+ fmla v31.4s, v3.4s, v11.s[0]
.endm
.macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
+
+ ldp q0, q1, [pA], #32
+
fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v22.4s, v6.4s, v12.s[1]
- fmla v23.4s, v7.4s, v12.s[1]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v26.4s, v6.4s, v13.s[0]
- fmla v27.4s, v7.4s, v13.s[0]
+ ldp s8, s9, [pB], #8
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
- fmla v30.4s, v6.4s, v13.s[1]
- fmla v31.4s, v7.4s, v13.s[1]
+ fmla v22.4s, v6.4s, v13.s[0]
+ fmla v23.4s, v7.4s, v13.s[0]
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp s10, s11, [pB], #8
+
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v26.4s, v6.4s, v14.s[0]
+ fmla v27.4s, v7.4s, v14.s[0]
+
+ ldp q2, q3, [pA], #32
+
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
+
+ fmla v30.4s, v6.4s, v15.s[0]
+ fmla v31.4s, v7.4s, v15.s[0]
.endm
.macro KERNEL16x4_E
fmla v16.4s, v4.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+
fmla v17.4s, v5.4s, v12.s[0]
- fmla v18.4s, v6.4s, v12.s[0]
- fmla v19.4s, v7.4s, v12.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v22.4s, v6.4s, v12.s[1]
- fmla v23.4s, v7.4s, v12.s[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v26.4s, v6.4s, v13.s[0]
- fmla v27.4s, v7.4s, v13.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v22.4s, v6.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v14.s[0]
+ fmla v30.4s, v6.4s, v15.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
- fmla v30.4s, v6.4s, v13.s[1]
- fmla v31.4s, v7.4s, v13.s[1]
+ fmla v19.4s, v7.4s, v12.s[0]
+ fmla v23.4s, v7.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v14.s[0]
+ fmla v31.4s, v7.4s, v15.s[0]
.endm
.macro KERNEL16x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp q0, q1, [pA], #32
+ ldp s8, s9, [pB], #8
fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+
+ ldp s10, s11, [pB], #8
+
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+
+ ldp q2, q3, [pA], #32
+
fmla v17.4s, v1.4s, v8.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
fmla v18.4s, v2.4s, v8.s[0]
+ fmla v22.4s, v2.4s, v9.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmla v19.4s, v3.4s, v8.s[0]
+ fmla v23.4s, v3.4s, v9.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v22.4s, v2.4s, v8.s[1]
- fmla v23.4s, v3.4s, v8.s[1]
+ fmla v26.4s, v2.4s, v10.s[0]
+ fmla v30.4s, v2.4s, v11.s[0]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v26.4s, v2.4s, v9.s[0]
- fmla v27.4s, v3.4s, v9.s[0]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
- fmla v30.4s, v2.4s, v9.s[1]
- fmla v31.4s, v3.4s, v9.s[1]
+ fmla v27.4s, v3.4s, v10.s[0]
+ fmla v31.4s, v3.4s, v11.s[0]
.endm
.macro SAVE16x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
- ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ ldp q0, q1, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
- fmla v2.4s, v18.4s, alphaV2
- fmla v3.4s, v19.4s, alphaV3
- st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v1.4s, v17.4s, alphaV0
+ stp q0, q1, [pCRow0]
- add pCRow2, pCRow1, LDC
+ add pCRow0, pCRow0, #32
- ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ ldp q2, q3, [pCRow0]
+ fmla v2.4s, v18.4s, alphaV0
+ fmla v3.4s, v19.4s, alphaV0
+ stp q2, q3, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+ ldp q4, q5, [pCRow1]
fmla v4.4s, v20.4s, alphaV0
- fmla v5.4s, v21.4s, alphaV1
- fmla v6.4s, v22.4s, alphaV2
- fmla v7.4s, v23.4s, alphaV3
- st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v5.4s, v21.4s, alphaV0
+ stp q4, q5, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ldp q6, q7, [pCRow1]
+ fmla v6.4s, v22.4s, alphaV0
+ fmla v7.4s, v23.4s, alphaV0
+ stp q6, q7, [pCRow1]
- add pCRow1, pCRow2, LDC
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
- ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+ ldp q0, q1, [pCRow2]
fmla v0.4s, v24.4s, alphaV0
- fmla v1.4s, v25.4s, alphaV1
- fmla v2.4s, v26.4s, alphaV2
- fmla v3.4s, v27.4s, alphaV3
- st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+ fmla v1.4s, v25.4s, alphaV0
+ stp q0, q1, [pCRow2]
- ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ add pCRow2, pCRow2, #32
+
+ ldp q2, q3, [pCRow2]
+ fmla v2.4s, v26.4s, alphaV0
+ fmla v3.4s, v27.4s, alphaV0
+ stp q2, q3, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+ ldp q4, q5, [pCRow3]
fmla v4.4s, v28.4s, alphaV0
- fmla v5.4s, v29.4s, alphaV1
- fmla v6.4s, v30.4s, alphaV2
- fmla v7.4s, v31.4s, alphaV3
- st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v5.4s, v29.4s, alphaV0
+ stp q4, q5, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
+
+ ldp q6, q7, [pCRow3]
+ fmla v6.4s, v30.4s, alphaV0
+ fmla v7.4s, v31.4s, alphaV0
+ stp q6, q7, [pCRow3]
+
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -363,264 +407,217 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.s[1]
- fmul v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v9.s[0]
- fmul v25.4s, v1.4s, v9.s[0]
- fmul v28.4s, v0.4s, v9.s[1]
- fmul v29.4s, v1.4s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
+ fmul v20.4s, v0.4s, v9.s[0]
+ fmul v21.4s, v1.4s, v9.s[0]
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v25.4s, v1.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
+ fmul v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
+
+ ldr q4, [pA], #16
+ ldr q5, [pA], #16
.endm
.macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
+
+ ldr q4, [pA], #16
+ ldr q5, [pA], #16
.endm
.macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
-
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
+
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
.endm
.macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
.endm
.macro SAVE8x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
- ld1 {v0.4s, v1.4s}, [pCRow0]
+ ldp q0, q1, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
- st1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v1.4s, v17.4s, alphaV0
+ stp q0, q1, [pCRow0]
- add pCRow2, pCRow1, LDC
+ add pCRow0, pCRow0, #32
- ld1 {v4.4s, v5.4s}, [pCRow1]
- fmla v4.4s, v20.4s, alphaV0
- fmla v5.4s, v21.4s, alphaV1
- st1 {v4.4s, v5.4s}, [pCRow1]
+ ldp q2, q3, [pCRow1]
+ fmla v2.4s, v20.4s, alphaV0
+ fmla v3.4s, v21.4s, alphaV0
+ stp q2, q3, [pCRow1]
- add pCRow1, pCRow2, LDC
+ add pCRow1, pCRow1, #32
- ld1 {v0.4s, v1.4s}, [pCRow2]
- fmla v0.4s, v24.4s, alphaV0
- fmla v1.4s, v25.4s, alphaV1
- st1 {v0.4s, v1.4s}, [pCRow2]
+ ldp q4, q5, [pCRow2]
+ fmla v4.4s, v24.4s, alphaV0
+ fmla v5.4s, v25.4s, alphaV0
+ stp q4, q5, [pCRow2]
- ld1 {v4.4s, v5.4s}, [pCRow1]
- fmla v4.4s, v28.4s, alphaV0
- fmla v5.4s, v29.4s, alphaV1
- st1 {v4.4s, v5.4s}, [pCRow1]
+ add pCRow2, pCRow2, #32
- add pCRow0, pCRow0, #32
+ ldp q6, q7, [pCRow3]
+ fmla v6.4s, v28.4s, alphaV0
+ fmla v7.4s, v29.4s, alphaV0
+ stp q6, q7, [pCRow3]
+
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
.macro INIT4x4
fmov s16, wzr
- fmov s17, s16
- fmov s20, s17
- fmov s21, s16
- fmov s24, s17
- fmov s25, s16
- fmov s28, s17
- fmov s29, s16
+ fmov s20, wzr
+ fmov s24, wzr
+ fmov s28, wzr
.endm
.macro KERNEL4x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s, v1.2s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- fmul v16.2s, v0.2s, v8.s[0]
- fmul v29.2s, v1.2s, v9.s[1]
+ ldr q0, [pA], #16
- fmul v20.2s, v0.2s, v8.s[1]
- fmul v25.2s, v1.2s, v9.s[0]
-
- fmul v24.2s, v0.2s, v9.s[0]
- fmul v21.2s, v1.2s, v8.s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v9.s[0]
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
- fmul v28.2s, v0.2s, v9.s[1]
- fmul v17.2s, v1.2s, v8.s[0]
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.2s, v5.2s}, [pA]
- add pA, pA, #16
+ ldr q1, [pA], #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.s[0]
- fmla v29.2s, v1.2s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB] // For next round
- add pB, pB, #16
-
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v25.2s, v1.2s, v9.s[0]
-
- ld1 {v4.2s, v5.2s}, [pA] // For next round
- add pA, pA, #16
-
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v21.2s, v1.2s, v8.s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
- prfm PLDL1KEEP, [pB, #512]
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
- fmla v28.2s, v0.2s, v9.s[1]
- fmla v17.2s, v1.2s, v8.s[0]
+ ldr q1, [pA], #16
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.s[0]
- fmla v29.2s, v5.2s, v13.s[1]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v13.s[0]
+ fmla v24.4s, v1.4s, v14.s[0]
+ fmla v28.4s, v1.4s, v15.s[0]
- ld1 {v8.2s, v9.2s}, [pB] // For next round
- add pB, pB, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- fmla v20.2s, v4.2s, v12.s[1]
- fmla v25.2s, v5.2s, v13.s[0]
-
- ld1 {v0.2s, v1.2s}, [pA] // For next round
- add pA, pA, #16
-
- fmla v24.2s, v4.2s, v13.s[0]
- fmla v21.2s, v5.2s, v12.s[1]
-
- prfm PLDL1KEEP, [pA, #512]
-
- fmla v28.2s, v4.2s, v13.s[1]
- fmla v17.2s, v5.2s, v12.s[0]
+ ldr q0, [pA], #16
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.s[0]
- fmla v29.2s, v5.2s, v13.s[1]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v13.s[0]
+ fmla v24.4s, v1.4s, v14.s[0]
+ fmla v28.4s, v1.4s, v15.s[0]
+.endm
- fmla v20.2s, v4.2s, v12.s[1]
- fmla v25.2s, v5.2s, v13.s[0]
+.macro KERNEL4x4_SUB
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- fmla v24.2s, v4.2s, v13.s[0]
- fmla v21.2s, v5.2s, v12.s[1]
+ ldr q0, [pA], #16
- fmla v28.2s, v4.2s, v13.s[1]
- fmla v17.2s, v5.2s, v12.s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
.endm
-.macro KERNEL4x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s, v1.2s}, [pA]
- add pA, pA, #16
-
- fmla v16.2s, v0.2s, v8.s[0]
- fmla v29.2s, v1.2s, v9.s[1]
+.macro SAVE4x4
+ fmov alpha0, alpha
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v25.2s, v1.2s, v9.s[0]
+ ldr q0, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ str q0, [pCRow0]
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v21.2s, v1.2s, v8.s[1]
+ add pCRow0, pCRow0, #16
- fmla v28.2s, v0.2s, v9.s[1]
- fmla v17.2s, v1.2s, v8.s[0]
-.endm
+ ldr q1, [pCRow1]
+ fmla v1.4s, v20.4s, alphaV0
+ str q1, [pCRow1]
-.macro SAVE4x4
- ld1 {v8.2s, v9.2s}, [pCRow0]
- fmla v8.2s, v16.2s, alphaV0
- fmla v9.2s, v17.2s, alphaV1
- st1 {v8.2s, v9.2s}, [pCRow0]
+ add pCRow1, pCRow1, #16
- add pCRow1, pCRow0, LDC
- ld1 {v12.2s, v13.2s}, [pCRow1]
- fmla v12.2s, v20.2s, alphaV2
- fmla v13.2s, v21.2s, alphaV3
- st1 {v12.2s, v13.2s}, [pCRow1]
+ ldr q2, [pCRow2]
+ fmla v2.4s, v24.4s, alphaV0
+ str q2, [pCRow2]
- add pCRow2, pCRow1, LDC
- ld1 {v8.2s, v9.2s}, [pCRow2]
- fmla v8.2s, v24.2s, alphaV0
- fmla v9.2s, v25.2s, alphaV1
- st1 {v8.2s, v9.2s}, [pCRow2]
+ add pCRow2, pCRow2, #16
- add pCRow1, pCRow2, LDC
- ld1 {v12.2s, v13.2s}, [pCRow1]
- fmla v12.2s, v28.2s, alphaV2
- fmla v13.2s, v29.2s, alphaV3
- st1 {v12.2s, v13.2s}, [pCRow1]
+ ldr q3, [pCRow3]
+ fmla v3.4s, v28.4s, alphaV0
+ str q3, [pCRow3]
- add pCRow0, pCRow0, #16
+ add pCRow3, pCRow3, #16
.endm
/******************************************************************************/
@@ -633,38 +630,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s}, [pA]
- add pA, pA, #8
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr d0, [pA], #8
fmla v16.2s, v0.2s, v8.s[0]
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v28.2s, v0.2s, v9.s[1]
+ fmla v20.2s, v0.2s, v9.s[0]
+ fmla v24.2s, v0.2s, v10.s[0]
+ fmla v28.2s, v0.2s, v11.s[0]
.endm
.macro SAVE2x4
- ld1 {v8.2s}, [pCRow0]
- fmla v8.2s, v16.2s, alphaV0
- st1 {v8.2s}, [pCRow0]
+ fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
- ld1 {v12.2s}, [pCRow1]
- fmla v12.2s, v20.2s, alphaV1
- st1 {v12.2s}, [pCRow1]
+ ldr d0, [pCRow0]
+ fmla v0.2s, v16.2s, alphaV0
+ str d0, [pCRow0]
- add pCRow2, pCRow1, LDC
- ld1 {v8.2s}, [pCRow2]
- fmla v8.2s, v24.2s, alphaV2
- st1 {v8.2s}, [pCRow2]
+ add pCRow0, pCRow0, #8
- add pCRow1, pCRow2, LDC
- ld1 {v12.2s}, [pCRow1]
- fmla v12.2s, v28.2s, alphaV3
- st1 {v12.2s}, [pCRow1]
+ ldr d1, [pCRow1]
+ fmla v1.2s, v20.2s, alphaV0
+ str d1, [pCRow1]
- add pCRow0, pCRow0, #8
+ add pCRow1, pCRow1, #8
+
+ ldr d0, [pCRow2]
+ fmla v0.2s, v24.2s, alphaV0
+ str d0, [pCRow2]
+
+ add pCRow2, pCRow2, #8
+
+ ldr d1, [pCRow3]
+ fmla v1.2s, v28.2s, alphaV0
+ str d1, [pCRow3]
+
+ add pCRow3, pCRow3, #8
.endm
/******************************************************************************/
@@ -686,22 +688,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
+
ld1 {v8.s}[0], [pCRow0]
ld1 {v8.s}[1], [pCRow1]
fmla v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]
st1 {v8.s}[1], [pCRow1]
- add pCRow2, pCRow1, LDC
- add pCRow1, pCRow2, LDC
+ add pCRow0, pCRow0, #4
+ add pCRow1, pCRow1, #4
+
ld1 {v12.s}[0], [pCRow2]
- ld1 {v12.s}[1], [pCRow1]
- fmla v12.2s, v20.2s, alphaV1
+ ld1 {v12.s}[1], [pCRow3]
+ fmla v12.2s, v20.2s, alphaV0
st1 {v12.s}[0], [pCRow2]
- st1 {v12.s}[1], [pCRow1]
+ st1 {v12.s}[1], [pCRow3]
- add pCRow0, pCRow0, #4
+ add pCRow2, pCRow2, #4
+ add pCRow3, pCRow3, #4
.endm
/******************************************************************************/
@@ -741,20 +746,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE16x2
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
- fmla v2.4s, v18.4s, alphaV2
- fmla v3.4s, v19.4s, alphaV3
+ fmla v1.4s, v17.4s, alphaV0
+ fmla v2.4s, v18.4s, alphaV0
+ fmla v3.4s, v19.4s, alphaV0
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0
- fmla v5.4s, v21.4s, alphaV1
- fmla v6.4s, v22.4s, alphaV2
- fmla v7.4s, v23.4s, alphaV3
+ fmla v5.4s, v21.4s, alphaV0
+ fmla v6.4s, v22.4s, alphaV0
+ fmla v7.4s, v23.4s, alphaV0
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -785,18 +792,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
ld1 {v0.4s, v1.4s}, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
+ fmla v1.4s, v17.4s, alphaV0
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
ld1 {v4.4s, v5.4s}, [pCRow1]
fmla v4.4s, v20.4s, alphaV0
- fmla v5.4s, v21.4s, alphaV1
+ fmla v5.4s, v21.4s, alphaV0
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -824,15 +833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
+
ld1 {v8.2s, v9.2s}, [pCRow0]
fmla v8.2s, v16.2s, alphaV0
- fmla v9.2s, v17.2s, alphaV1
+ fmla v9.2s, v17.2s, alphaV0
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2s, v13.2s}, [pCRow1]
- fmla v12.2s, v20.2s, alphaV2
- fmla v13.2s, v21.2s, alphaV3
+ fmla v12.2s, v20.2s, alphaV0
+ fmla v13.2s, v21.2s, alphaV0
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -857,13 +868,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
+
ld1 {v8.2s}, [pCRow0]
fmla v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1 , pCRow0, LDC
ld1 {v12.2s}, [pCRow1]
- fmla v12.2s, v20.2s, alphaV1
+ fmla v12.2s, v20.2s, alphaV0
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8
@@ -886,6 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
+
add pCRow1 , pCRow0, LDC
ld1 {v8.s}[0], [pCRow0]
ld1 {v8.s}[1], [pCRow1]
@@ -925,11 +940,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE16x1
+ fmov alpha0, alpha
+
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
- fmla v2.4s, v18.4s, alphaV2
- fmla v3.4s, v19.4s, alphaV3
+ fmla v1.4s, v17.4s, alphaV0
+ fmla v2.4s, v18.4s, alphaV0
+ fmla v3.4s, v19.4s, alphaV0
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -956,9 +973,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
+ fmov alpha0, alpha
+
ld1 {v0.4s, v1.4s}, [pCRow0]
fmla v0.4s, v16.4s, alphaV0
- fmla v1.4s, v17.4s, alphaV1
+ fmla v1.4s, v17.4s, alphaV0
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -983,9 +1002,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
+
ld1 {v8.2s, v9.2s}, [pCRow0]
fmla v8.2s, v16.2s, alphaV0
- fmla v9.2s, v17.2s, alphaV1
+ fmla v9.2s, v17.2s, alphaV0
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow0, pCRow0, #16
@@ -1008,6 +1029,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
+
ld1 {v8.2s}, [pCRow0]
fmla v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
@@ -1032,6 +1055,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
+
ldr s8, [pCRow0]
fmla s8, s16, alphaV0
str s8, [pCRow0]
@@ -1061,10 +1086,10 @@ sgemm_kernel_begin:
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0, s0
- fmov alpha1, s0
- fmov alpha2, s0
- fmov alpha3, s0
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alpha, s0
lsl LDC, LDC, #2 // ldc = ldc * 4
@@ -1078,8 +1103,12 @@ sgemm_kernel_begin:
/******************************************************************************/
sgemm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
@@ -1090,42 +1119,69 @@ sgemm_kernel_L4_M16_BEGIN:
cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
+ .align 5
sgemm_kernel_L4_M16_20:
mov pB, origPB
- asr counterL , origK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , origK, #3
+ cmp counterL , #2
blt sgemm_kernel_L4_M16_32
- KERNEL16x4_I // do one in the K
- KERNEL16x4_M2 // do another in the K
+ KERNEL16x4_I
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
- .align 5
+ .align 5
sgemm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
+ .align 5
sgemm_kernel_L4_M16_22a:
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
+ .align 5
sgemm_kernel_L4_M16_32:
tst counterL, #1
ble sgemm_kernel_L4_M16_40
KERNEL16x4_I
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
@@ -1136,14 +1192,20 @@ sgemm_kernel_L4_M16_40:
sgemm_kernel_L4_M16_44:
- ands counterL , origK, #1
+ ands counterL , origK, #7
ble sgemm_kernel_L4_M16_100
+ .align 5
sgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
+ subs counterL, counterL, #1
+ bne sgemm_kernel_L4_M16_46
sgemm_kernel_L4_M16_100:
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
SAVE16x4
diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S
index 28b321651..77e05103d 100644
--- a/kernel/arm64/strmm_kernel_16x4.S
+++ b/kernel/arm64/strmm_kernel_16x4.S
@@ -46,19 +46,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define temp x16
-#define tempOffset x17
-#define tempK x18
+#define pCRow3 x15
+#define pA x16
+#define alpha w17
+#define temp x18
+#define tempOffset x19
+#define tempK x20
#define alpha0 s10
#define alphaV0 v10.s[0]
-#define alpha1 s11
-#define alphaV1 v11.s[0]
-#define alpha2 s14
-#define alphaV2 v14.s[0]
-#define alpha3 s15
-#define alphaV3 v15.s[0]
+
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 224
+#define C_PRE_SIZE 160
+
// 00 origM
// 01 origN
@@ -101,14 +102,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_04, pA1_05, pA1_06, pA1_07
//v06 pA1_08, pA1_09, pA1_10, pA1_11
//v07 pA1_12, pA1_13, pA1_14, pA1_15
-//v08 must save pB00, pB01
-//v09 must save pB02, pB03
-//v10 must save ALPHA0
-//v11 must save ALPHA1
-//v12 must save pB10, pB11
-//v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v08 must save pB00
+//v09 must save pB01
+//v10 must save pB02
+//v11 must save pB03
+//v12 must save pB10
+//v13 must save pB11
+//v14 must save pB12
+//v15 must save pB13
//v16 must save C00, C01, C02, C03
//v17 must save C04, C05, C06, C07
//v18 C08, C09, C10, C11
@@ -150,202 +151,240 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL16x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp q0, q1, [pA], #32
+
+ ldp s8, s9, [pB], #8
fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v9.s[0]
+
+ ldp s10, s11, [pB], #8
+
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
+
+ ldp q2, q3, [pA], #32
+
fmul v17.4s, v1.4s, v8.s[0]
+ fmul v21.4s, v1.4s, v9.s[0]
+
+ ldp q4, q5, [pA], #32
+
+ fmul v25.4s, v1.4s, v10.s[0]
+ fmul v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+
fmul v18.4s, v2.4s, v8.s[0]
+ fmul v22.4s, v2.4s, v9.s[0]
+
+ ldp s14, s15, [pB], #8
+
fmul v19.4s, v3.4s, v8.s[0]
+ fmul v23.4s, v3.4s, v9.s[0]
- fmul v20.4s, v0.4s, v8.s[1]
- fmul v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.s[1]
- fmul v23.4s, v3.4s, v8.s[1]
+ ldp q6, q7, [pA], #32
- fmul v24.4s, v0.4s, v9.s[0]
- fmul v25.4s, v1.4s, v9.s[0]
- fmul v26.4s, v2.4s, v9.s[0]
- fmul v27.4s, v3.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v10.s[0]
+ fmul v30.4s, v2.4s, v11.s[0]
- fmul v28.4s, v0.4s, v9.s[1]
- fmul v29.4s, v1.4s, v9.s[1]
- fmul v30.4s, v2.4s, v9.s[1]
- fmul v31.4s, v3.4s, v9.s[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
- ld1 {v6.4s}, [pA]
- add pA, pA, #16
- ld1 {v7.4s}, [pA]
- add pA, pA, #16
+ fmul v27.4s, v3.4s, v10.s[0]
+ fmul v31.4s, v3.4s, v11.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
+
+ ldp q4, q5, [pA], #32
+
fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v22.4s, v2.4s, v8.s[1]
- fmla v23.4s, v3.4s, v8.s[1]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v26.4s, v2.4s, v9.s[0]
- fmla v27.4s, v3.4s, v9.s[0]
+ ldp s12, s13, [pB], #8
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
- fmla v30.4s, v2.4s, v9.s[1]
- fmla v31.4s, v3.4s, v9.s[1]
+ fmla v22.4s, v2.4s, v9.s[0]
+ fmla v23.4s, v3.4s, v9.s[0]
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
- ld1 {v6.4s}, [pA]
- add pA, pA, #16
- ld1 {v7.4s}, [pA]
- add pA, pA, #16
+ ldp s14, s15, [pB], #8
+
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+
+ fmla v26.4s, v2.4s, v10.s[0]
+ fmla v27.4s, v3.4s, v10.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
+ ldp q6, q7, [pA], #32
+
+ fmla v30.4s, v2.4s, v11.s[0]
+ fmla v31.4s, v3.4s, v11.s[0]
.endm
.macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
+
+ ldp q0, q1, [pA], #32
+
fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v22.4s, v6.4s, v12.s[1]
- fmla v23.4s, v7.4s, v12.s[1]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v26.4s, v6.4s, v13.s[0]
- fmla v27.4s, v7.4s, v13.s[0]
+ ldp s8, s9, [pB], #8
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
- fmla v30.4s, v6.4s, v13.s[1]
- fmla v31.4s, v7.4s, v13.s[1]
+ fmla v22.4s, v6.4s, v13.s[0]
+ fmla v23.4s, v7.4s, v13.s[0]
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp s10, s11, [pB], #8
+
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v26.4s, v6.4s, v14.s[0]
+ fmla v27.4s, v7.4s, v14.s[0]
+
+ ldp q2, q3, [pA], #32
+
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
+
+ fmla v30.4s, v6.4s, v15.s[0]
+ fmla v31.4s, v7.4s, v15.s[0]
.endm
.macro KERNEL16x4_E
fmla v16.4s, v4.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+
fmla v17.4s, v5.4s, v12.s[0]
- fmla v18.4s, v6.4s, v12.s[0]
- fmla v19.4s, v7.4s, v12.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v22.4s, v6.4s, v12.s[1]
- fmla v23.4s, v7.4s, v12.s[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v26.4s, v6.4s, v13.s[0]
- fmla v27.4s, v7.4s, v13.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v22.4s, v6.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v14.s[0]
+ fmla v30.4s, v6.4s, v15.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
- fmla v30.4s, v6.4s, v13.s[1]
- fmla v31.4s, v7.4s, v13.s[1]
+ fmla v19.4s, v7.4s, v12.s[0]
+ fmla v23.4s, v7.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v14.s[0]
+ fmla v31.4s, v7.4s, v15.s[0]
.endm
.macro KERNEL16x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
- ld1 {v2.4s}, [pA]
- add pA, pA, #16
- ld1 {v3.4s}, [pA]
- add pA, pA, #16
+ ldp q0, q1, [pA], #32
+ ldp s8, s9, [pB], #8
fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+
+ ldp s10, s11, [pB], #8
+
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+
+ ldp q2, q3, [pA], #32
+
fmla v17.4s, v1.4s, v8.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
fmla v18.4s, v2.4s, v8.s[0]
+ fmla v22.4s, v2.4s, v9.s[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmla v19.4s, v3.4s, v8.s[0]
+ fmla v23.4s, v3.4s, v9.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v22.4s, v2.4s, v8.s[1]
- fmla v23.4s, v3.4s, v8.s[1]
+ fmla v26.4s, v2.4s, v10.s[0]
+ fmla v30.4s, v2.4s, v11.s[0]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v26.4s, v2.4s, v9.s[0]
- fmla v27.4s, v3.4s, v9.s[0]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
- fmla v30.4s, v2.4s, v9.s[1]
- fmla v31.4s, v3.4s, v9.s[1]
+ fmla v27.4s, v3.4s, v10.s[0]
+ fmla v31.4s, v3.4s, v11.s[0]
.endm
.macro SAVE16x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
+
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
- fmul v2.4s, v18.4s, alphaV2
- fmul v3.4s, v19.4s, alphaV3
- st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmul v1.4s, v17.4s, alphaV0
+ stp q0, q1, [pCRow0]
- add pCRow2, pCRow1, LDC
+ add pCRow0, pCRow0, #32
+
+ fmul v2.4s, v18.4s, alphaV0
+ fmul v3.4s, v19.4s, alphaV0
+ stp q2, q3, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.4s, v20.4s, alphaV0
- fmul v5.4s, v21.4s, alphaV1
- fmul v6.4s, v22.4s, alphaV2
- fmul v7.4s, v23.4s, alphaV3
- st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmul v5.4s, v21.4s, alphaV0
+ stp q4, q5, [pCRow1]
+
+ add pCRow1, pCRow1, #32
- add pCRow1, pCRow2, LDC
+ fmul v6.4s, v22.4s, alphaV0
+ fmul v7.4s, v23.4s, alphaV0
+ stp q6, q7, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.4s, v24.4s, alphaV0
- fmul v1.4s, v25.4s, alphaV1
- fmul v2.4s, v26.4s, alphaV2
- fmul v3.4s, v27.4s, alphaV3
- st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+ fmul v1.4s, v25.4s, alphaV0
+ stp q0, q1, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
+ fmul v2.4s, v26.4s, alphaV0
+ fmul v3.4s, v27.4s, alphaV0
+ stp q2, q3, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.4s, v28.4s, alphaV0
- fmul v5.4s, v29.4s, alphaV1
- fmul v6.4s, v30.4s, alphaV2
- fmul v7.4s, v31.4s, alphaV3
- st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmul v5.4s, v29.4s, alphaV0
+ stp q4, q5, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
+ fmul v6.4s, v30.4s, alphaV0
+ fmul v7.4s, v31.4s, alphaV0
+ stp q6, q7, [pCRow3]
+
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -362,260 +401,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.s[1]
- fmul v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v9.s[0]
- fmul v25.4s, v1.4s, v9.s[0]
- fmul v28.4s, v0.4s, v9.s[1]
- fmul v29.4s, v1.4s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
+ fmul v20.4s, v0.4s, v9.s[0]
+ fmul v21.4s, v1.4s, v9.s[0]
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v25.4s, v1.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
+ fmul v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
+
+ ldr q4, [pA], #16
+ ldr q5, [pA], #16
.endm
.macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.4s}, [pA]
- add pA, pA, #16
- ld1 {v5.4s}, [pA]
- add pA, pA, #16
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
+
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
+
+ ldr q4, [pA], #16
+ ldr q5, [pA], #16
.endm
.macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
-
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
+
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
.endm
.macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.s[0]
- fmla v20.4s, v4.4s, v12.s[1]
- fmla v21.4s, v5.4s, v12.s[1]
- fmla v24.4s, v4.4s, v13.s[0]
- fmla v25.4s, v5.4s, v13.s[0]
- fmla v28.4s, v4.4s, v13.s[1]
- fmla v29.4s, v5.4s, v13.s[1]
+ fmla v20.4s, v4.4s, v13.s[0]
+ fmla v21.4s, v5.4s, v13.s[0]
+ fmla v24.4s, v4.4s, v14.s[0]
+ fmla v25.4s, v5.4s, v14.s[0]
+ fmla v28.4s, v4.4s, v15.s[0]
+ fmla v29.4s, v5.4s, v15.s[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.4s}, [pA]
- add pA, pA, #16
- ld1 {v1.4s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr q0, [pA], #16
+ ldr q1, [pA], #16
fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.s[1]
- fmla v21.4s, v1.4s, v8.s[1]
- fmla v24.4s, v0.4s, v9.s[0]
- fmla v25.4s, v1.4s, v9.s[0]
- fmla v28.4s, v0.4s, v9.s[1]
- fmla v29.4s, v1.4s, v9.s[1]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v21.4s, v1.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v25.4s, v1.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
+ fmla v29.4s, v1.4s, v11.s[0]
.endm
.macro SAVE8x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
- st1 {v0.4s, v1.4s}, [pCRow0]
+ fmul v1.4s, v17.4s, alphaV0
+ stp q0, q1, [pCRow0]
- add pCRow2, pCRow1, LDC
+ add pCRow0, pCRow0, #32
- fmul v4.4s, v20.4s, alphaV0
- fmul v5.4s, v21.4s, alphaV1
- st1 {v4.4s, v5.4s}, [pCRow1]
+ fmul v2.4s, v20.4s, alphaV0
+ fmul v3.4s, v21.4s, alphaV0
+ stp q2, q3, [pCRow1]
- add pCRow1, pCRow2, LDC
+ add pCRow1, pCRow1, #32
- fmul v0.4s, v24.4s, alphaV0
- fmul v1.4s, v25.4s, alphaV1
- st1 {v0.4s, v1.4s}, [pCRow2]
+ fmul v4.4s, v24.4s, alphaV0
+ fmul v5.4s, v25.4s, alphaV0
+ stp q4, q5, [pCRow2]
- fmul v4.4s, v28.4s, alphaV0
- fmul v5.4s, v29.4s, alphaV1
- st1 {v4.4s, v5.4s}, [pCRow1]
+ add pCRow2, pCRow2, #32
- add pCRow0, pCRow0, #32
+ fmul v6.4s, v28.4s, alphaV0
+ fmul v7.4s, v29.4s, alphaV0
+ stp q6, q7, [pCRow3]
+
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
.macro INIT4x4
fmov s16, wzr
- fmov s17, s16
- fmov s20, s17
- fmov s21, s16
- fmov s24, s17
- fmov s25, s16
- fmov s28, s17
- fmov s29, s16
+ fmov s20, wzr
+ fmov s24, wzr
+ fmov s28, wzr
.endm
.macro KERNEL4x4_I
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s, v1.2s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- fmul v16.2s, v0.2s, v8.s[0]
- fmul v29.2s, v1.2s, v9.s[1]
+ ldr q0, [pA], #16
- fmul v20.2s, v0.2s, v8.s[1]
- fmul v25.2s, v1.2s, v9.s[0]
-
- fmul v24.2s, v0.2s, v9.s[0]
- fmul v21.2s, v1.2s, v8.s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v9.s[0]
+ fmul v24.4s, v0.4s, v10.s[0]
+ fmul v28.4s, v0.4s, v11.s[0]
- fmul v28.2s, v0.2s, v9.s[1]
- fmul v17.2s, v1.2s, v8.s[0]
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
- ld1 {v12.2s, v13.2s}, [pB]
- add pB, pB, #16
- ld1 {v4.2s, v5.2s}, [pA]
- add pA, pA, #16
+ ldr q1, [pA], #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.s[0]
- fmla v29.2s, v1.2s, v9.s[1]
-
- ld1 {v12.2s, v13.2s}, [pB] // For next round
- add pB, pB, #16
-
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v25.2s, v1.2s, v9.s[0]
-
- ld1 {v4.2s, v5.2s}, [pA] // For next round
- add pA, pA, #16
-
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v21.2s, v1.2s, v8.s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
- prfm PLDL1KEEP, [pB, #512]
+ ldp s12, s13, [pB], #8
+ ldp s14, s15, [pB], #8
- fmla v28.2s, v0.2s, v9.s[1]
- fmla v17.2s, v1.2s, v8.s[0]
+ ldr q1, [pA], #16
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.s[0]
- fmla v29.2s, v5.2s, v13.s[1]
-
- ld1 {v8.2s, v9.2s}, [pB] // For next round
- add pB, pB, #16
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v13.s[0]
+ fmla v24.4s, v1.4s, v14.s[0]
+ fmla v28.4s, v1.4s, v15.s[0]
- fmla v20.2s, v4.2s, v12.s[1]
- fmla v25.2s, v5.2s, v13.s[0]
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- ld1 {v0.2s, v1.2s}, [pA] // For next round
- add pA, pA, #16
-
- fmla v24.2s, v4.2s, v13.s[0]
- fmla v21.2s, v5.2s, v12.s[1]
-
- prfm PLDL1KEEP, [pA, #512]
-
- fmla v28.2s, v4.2s, v13.s[1]
- fmla v17.2s, v5.2s, v12.s[0]
+ ldr q0, [pA], #16
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.s[0]
- fmla v29.2s, v5.2s, v13.s[1]
-
- fmla v20.2s, v4.2s, v12.s[1]
- fmla v25.2s, v5.2s, v13.s[0]
-
- fmla v24.2s, v4.2s, v13.s[0]
- fmla v21.2s, v5.2s, v12.s[1]
-
- fmla v28.2s, v4.2s, v13.s[1]
- fmla v17.2s, v5.2s, v12.s[0]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v13.s[0]
+ fmla v24.4s, v1.4s, v14.s[0]
+ fmla v28.4s, v1.4s, v15.s[0]
.endm
.macro KERNEL4x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s, v1.2s}, [pA]
- add pA, pA, #16
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
- fmla v16.2s, v0.2s, v8.s[0]
- fmla v29.2s, v1.2s, v9.s[1]
+ ldr q0, [pA], #16
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v25.2s, v1.2s, v9.s[0]
-
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v21.2s, v1.2s, v8.s[1]
-
- fmla v28.2s, v0.2s, v9.s[1]
- fmla v17.2s, v1.2s, v8.s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v9.s[0]
+ fmla v24.4s, v0.4s, v10.s[0]
+ fmla v28.4s, v0.4s, v11.s[0]
.endm
.macro SAVE4x4
+ fmov alpha0, alpha
- fmul v8.2s, v16.2s, alphaV0
- fmul v9.2s, v17.2s, alphaV1
- st1 {v8.2s, v9.2s}, [pCRow0]
+ fmul v0.4s, v16.4s, alphaV0
+ str q0, [pCRow0]
- add pCRow1, pCRow0, LDC
+ add pCRow0, pCRow0, #16
- fmul v12.2s, v20.2s, alphaV2
- fmul v13.2s, v21.2s, alphaV3
- st1 {v12.2s, v13.2s}, [pCRow1]
+ fmul v1.4s, v20.4s, alphaV0
+ str q1, [pCRow1]
- add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow1, #16
- fmul v8.2s, v24.2s, alphaV0
- fmul v9.2s, v25.2s, alphaV1
- st1 {v8.2s, v9.2s}, [pCRow2]
+ fmul v2.4s, v24.4s, alphaV0
+ str q2, [pCRow2]
- add pCRow1, pCRow2, LDC
+ add pCRow2, pCRow2, #16
- fmul v12.2s, v28.2s, alphaV2
- fmul v13.2s, v29.2s, alphaV3
- st1 {v12.2s, v13.2s}, [pCRow1]
+ fmul v3.4s, v28.4s, alphaV0
+ str q3, [pCRow3]
- add pCRow0, pCRow0, #16
+ add pCRow3, pCRow3, #16
.endm
/******************************************************************************/
@@ -628,34 +616,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x4_SUB
- ld1 {v8.2s, v9.2s}, [pB]
- add pB, pB, #16
- ld1 {v0.2s}, [pA]
- add pA, pA, #8
+ ldp s8, s9, [pB], #8
+ ldp s10, s11, [pB], #8
+
+ ldr d0, [pA], #8
fmla v16.2s, v0.2s, v8.s[0]
- fmla v20.2s, v0.2s, v8.s[1]
- fmla v24.2s, v0.2s, v9.s[0]
- fmla v28.2s, v0.2s, v9.s[1]
+ fmla v20.2s, v0.2s, v9.s[0]
+ fmla v24.2s, v0.2s, v10.s[0]
+ fmla v28.2s, v0.2s, v11.s[0]
.endm
.macro SAVE2x4
- fmul v8.2s, v16.2s, alphaV0
- st1 {v8.2s}, [pCRow0]
+ fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
- fmul v12.2s, v20.2s, alphaV1
- st1 {v12.2s}, [pCRow1]
+ fmul v0.2s, v16.2s, alphaV0
+ str d0, [pCRow0]
- add pCRow2, pCRow1, LDC
- fmul v8.2s, v24.2s, alphaV2
- st1 {v8.2s}, [pCRow2]
+ add pCRow0, pCRow0, #8
- add pCRow1, pCRow2, LDC
- fmul v12.2s, v28.2s, alphaV3
- st1 {v12.2s}, [pCRow1]
+ fmul v1.2s, v20.2s, alphaV0
+ str d1, [pCRow1]
- add pCRow0, pCRow0, #8
+ add pCRow1, pCRow1, #8
+
+ fmul v0.2s, v24.2s, alphaV0
+ str d0, [pCRow2]
+
+ add pCRow2, pCRow2, #8
+
+ fmul v1.2s, v28.2s, alphaV0
+ str d1, [pCRow3]
+
+ add pCRow3, pCRow3, #8
.endm
/******************************************************************************/
@@ -677,20 +670,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
- add pCRow1, pCRow0, LDC
+ fmov alpha0, alpha
fmul v8.2s, v16.2s, alphaV0
st1 {v8.s}[0], [pCRow0]
st1 {v8.s}[1], [pCRow1]
- add pCRow2, pCRow1, LDC
- add pCRow1, pCRow2, LDC
+ add pCRow0, pCRow0, #4
+ add pCRow1, pCRow1, #4
- fmul v12.2s, v20.2s, alphaV1
+ fmul v12.2s, v20.2s, alphaV0
st1 {v12.s}[0], [pCRow2]
- st1 {v12.s}[1], [pCRow1]
+ st1 {v12.s}[1], [pCRow3]
- add pCRow0, pCRow0, #4
+ add pCRow2, pCRow2, #4
+ add pCRow3, pCRow3, #4
.endm
/******************************************************************************/
@@ -730,18 +724,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE16x2
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
- fmul v2.4s, v18.4s, alphaV2
- fmul v3.4s, v19.4s, alphaV3
+ fmul v1.4s, v17.4s, alphaV0
+ fmul v2.4s, v18.4s, alphaV0
+ fmul v3.4s, v19.4s, alphaV0
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
fmul v4.4s, v20.4s, alphaV0
- fmul v5.4s, v21.4s, alphaV1
- fmul v6.4s, v22.4s, alphaV2
- fmul v7.4s, v23.4s, alphaV3
+ fmul v5.4s, v21.4s, alphaV0
+ fmul v6.4s, v22.4s, alphaV0
+ fmul v7.4s, v23.4s, alphaV0
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -772,16 +768,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
+ fmul v1.4s, v17.4s, alphaV0
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow2, pCRow1, LDC
fmul v4.4s, v20.4s, alphaV0
- fmul v5.4s, v21.4s, alphaV1
+ fmul v5.4s, v21.4s, alphaV0
st1 {v4.4s, v5.4s}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -809,15 +807,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
fmul v8.2s, v16.2s, alphaV0
- fmul v9.2s, v17.2s, alphaV1
+ fmul v9.2s, v17.2s, alphaV0
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow1, pCRow0, LDC
- fmul v12.2s, v20.2s, alphaV2
- fmul v13.2s, v21.2s, alphaV3
+ fmul v12.2s, v20.2s, alphaV0
+ fmul v13.2s, v21.2s, alphaV0
st1 {v12.2s, v13.2s}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -842,12 +841,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
+
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
add pCRow1 , pCRow0, LDC
- fmul v12.2s, v20.2s, alphaV1
+ fmul v12.2s, v20.2s, alphaV0
st1 {v12.2s}, [pCRow1]
add pCRow0, pCRow0, #8
@@ -870,6 +871,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
+
add pCRow1 , pCRow0, LDC
fmul v8.2s, v16.2s, alphaV0
@@ -908,11 +911,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE16x1
+ fmov alpha0, alpha
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
- fmul v2.4s, v18.4s, alphaV2
- fmul v3.4s, v19.4s, alphaV3
+ fmul v1.4s, v17.4s, alphaV0
+ fmul v2.4s, v18.4s, alphaV0
+ fmul v3.4s, v19.4s, alphaV0
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -939,9 +943,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
+ fmov alpha0, alpha
fmul v0.4s, v16.4s, alphaV0
- fmul v1.4s, v17.4s, alphaV1
+ fmul v1.4s, v17.4s, alphaV0
st1 {v0.4s, v1.4s}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -966,9 +971,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
fmul v8.2s, v16.2s, alphaV0
- fmul v9.2s, v17.2s, alphaV1
+ fmul v9.2s, v17.2s, alphaV0
st1 {v8.2s, v9.2s}, [pCRow0]
add pCRow0, pCRow0, #16
@@ -991,6 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
fmul v8.2s, v16.2s, alphaV0
st1 {v8.2s}, [pCRow0]
@@ -1015,6 +1022,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
fmul s8, s16, alpha0
str s8, [pCRow0]
@@ -1043,10 +1051,10 @@ strmm_kernel_begin:
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0, s0
- fmov alpha1, s0
- fmov alpha2, s0
- fmov alpha3, s0
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alpha, s0
lsl LDC, LDC, #2 // ldc = ldc * 4
@@ -1063,8 +1071,13 @@ strmm_kernel_begin:
/******************************************************************************/
strmm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
#if defined(LEFT)
mov tempOffset, offset
@@ -1078,6 +1091,7 @@ strmm_kernel_L4_M16_BEGIN:
cmp counterI, #0
ble strmm_kernel_L4_M8_BEGIN
+ .align 5
strmm_kernel_L4_M16_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1098,38 +1112,64 @@ strmm_kernel_L4_M16_20:
add tempK, tempOffset, #4
#endif
- asr counterL , tempK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , tempK, #3
+ cmp counterL , #2
blt strmm_kernel_L4_M16_32
- KERNEL16x4_I // do one in the K
- KERNEL16x4_M2 // do another in the K
+ KERNEL16x4_I
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
subs counterL, counterL, #2
ble strmm_kernel_L4_M16_22a
- .align 5
+ .align 5
strmm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
subs counterL, counterL, #1
bgt strmm_kernel_L4_M16_22
+ .align 5
strmm_kernel_L4_M16_22a:
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
b strmm_kernel_L4_M16_44
+ .align 5
strmm_kernel_L4_M16_32:
tst counterL, #1
ble strmm_kernel_L4_M16_40
KERNEL16x4_I
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+ KERNEL16x4_M1
KERNEL16x4_E
b strmm_kernel_L4_M16_44
@@ -1140,12 +1180,15 @@ strmm_kernel_L4_M16_40:
strmm_kernel_L4_M16_44:
- ands counterL , tempK, #1
+ ands counterL , tempK, #7
ble strmm_kernel_L4_M16_100
+ .align 5
strmm_kernel_L4_M16_46:
KERNEL16x4_SUB
+ subs counterL, counterL, #1
+ bne strmm_kernel_L4_M16_46
strmm_kernel_L4_M16_100:
@@ -1166,6 +1209,9 @@ strmm_kernel_L4_M16_100:
#if defined(LEFT)
add tempOffset, tempOffset, #16
#endif
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
strmm_kernel_L4_M16_END:
subs counterI, counterI, #1
diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S
index 1cb695e56..08a1531cf 100644
--- a/kernel/arm64/zgemm_kernel_4x4.S
+++ b/kernel/arm64/zgemm_kernel_4x4.S
@@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define alpha_save_R x16
-#define alpha_save_I x17
+#define pCRow3 x15
+#define pA x16
+#define alphaR x17
+#define alphaI x18
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
-#define alpha1_R d14
-#define alphaV1_R v14.d[0]
-#define alpha1_I d15
-#define alphaV1_I v15.d[0]
-
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
-// 15 pA
-// 16 alpha_save_R
-// 17 alpha_save_I
-// 18 must save
+// 15 pCRow3
+// 16 pA
+// 17 alphaR
+// 18 must save alphaI
// 19 must save
// 20 must save
// 21 must save
@@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
- ld2 {v10.2d, v11.2d}, [pB]
- add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- ld2 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.d[0]
- OP_ii v18.2d, v3.2d, v9.d[0]
-#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
- eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.d[0]
-#else
- fmul v19.2d, v2.2d, v9.d[0]
-#endif
- OP_ir v19.2d, v3.2d, v8.d[0]
+ ld2 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
+ ld2 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.2d, v2.2d, v9.d[0]
+#else
+ fmul v19.2d, v2.2d, v9.d[0]
+#endif
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ ld2 {v4.2d, v5.2d} , [pA]
+ add pA, pA, #32
+
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
+ ld2 {v6.2d, v7.2d} , [pA]
+ add pA, pA, #32
+
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
+ ld2 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
- ld2 {v12.2d, v13.2d}, [pB]
- add pB, pB, #32
- ld2 {v14.2d, v15.2d}, [pB]
- add pB, pB, #32
- ld2 {v4.2d, v5.2d} , [pA]
- add pA, pA, #32
- ld2 {v6.2d, v7.2d} , [pA]
- add pA, pA, #32
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
- ld2 {v12.2d, v13.2d}, [pB] // For next round
+ ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
- ld2 {v14.2d, v15.2d}, [pB] // For next round
- add pB, pB, #32
+ ld2 {v4.2d, v5.2d} , [pA]
+ add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
- ld2 {v4.2d, v5.2d} , [pA] // For next round
+ ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
- ld2 {v6.2d, v7.2d} , [pA] // For next round
- add pA, pA, #32
+ ld2 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
- prfm PLDL1KEEP, [pA, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
- prfm PLDL1KEEP, [pB, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
- ld2 {v8.2d, v9.2d}, [pB] // For next round
+ ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
- ld2 {v10.2d, v11.2d}, [pB] // For next round
- add pB, pB, #32
+ ld2 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
- ld2 {v0.2d, v1.2d}, [pA] // For next round
+ ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
- ld2 {v2.2d, v3.2d}, [pA] // For next round
- add pA, pA, #32
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
- prfm PLDL1KEEP, [pA, #512]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
- prfm PLDL1KEEP, [pB, #512]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
+
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
- ld2 {v10.2d, v11.2d}, [pB]
- add pB, pB, #32
+
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- ld2 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.d[0]
- OP_ii v18.2d, v3.2d, v9.d[0]
- OP_ri v19.2d, v2.2d, v9.d[0]
- OP_ir v19.2d, v3.2d, v8.d[0]
+ ld2 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
- mov pCRow1, pCRow0
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
- ld2 {v0.2d, v1.2d}, [pCRow1]
+ ld2 {v0.2d, v1.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
- st2 {v0.2d, v1.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
- ld2 {v2.2d, v3.2d}, [pCRow2]
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
+ st2 {v0.2d, v1.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ ld2 {v2.2d, v3.2d}, [pCRow0]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmla v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
- st2 {v2.2d, v3.2d}, [pCRow2]
+ fmla v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
+ st2 {v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
- add pCRow1, pCRow1, LDC
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmla v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmla v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
- ld2 {v6.2d, v7.2d}, [pCRow2]
+
+ add pCRow1, pCRow1, #32
+
+ ld2 {v6.2d, v7.2d}, [pCRow1]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
- fmla v7.2d, v22.2d, alphaV1_I
- fmla v7.2d, v23.2d, alphaV1_R
- st2 {v6.2d, v7.2d}, [pCRow2]
+ fmla v7.2d, v22.2d, alphaV0_I
+ fmla v7.2d, v23.2d, alphaV0_R
+ st2 {v6.2d, v7.2d}, [pCRow1]
- add pCRow1, pCRow1, LDC
- ld2 {v0.2d, v1.2d}, [pCRow1]
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+ ld2 {v0.2d, v1.2d}, [pCRow2]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
- fmla v1.2d, v24.2d, alphaV1_I
- fmla v1.2d, v25.2d, alphaV1_R
- st2 {v0.2d, v1.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
+ fmla v1.2d, v24.2d, alphaV0_I
+ fmla v1.2d, v25.2d, alphaV0_R
+ st2 {v0.2d, v1.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
- fmla v3.2d, v26.2d, alphaV1_I
- fmla v3.2d, v27.2d, alphaV1_R
+ fmla v3.2d, v26.2d, alphaV0_I
+ fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
- add pCRow1, pCRow1, LDC
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
- ld2 {v4.2d, v5.2d}, [pCRow1]
+ ld2 {v4.2d, v5.2d}, [pCRow3]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
- fmla v5.2d, v28.2d, alphaV1_I
- fmla v5.2d, v29.2d, alphaV1_R
- st2 {v4.2d, v5.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
- ld2 {v6.2d, v7.2d}, [pCRow2]
+ fmla v5.2d, v28.2d, alphaV0_I
+ fmla v5.2d, v29.2d, alphaV0_R
+ st2 {v4.2d, v5.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
+ ld2 {v6.2d, v7.2d}, [pCRow3]
fmla v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
- fmla v7.2d, v30.2d, alphaV1_I
- fmla v7.2d, v31.2d, alphaV1_R
- st2 {v6.2d, v7.2d}, [pCRow2]
+ fmla v7.2d, v30.2d, alphaV0_I
+ fmla v7.2d, v31.2d, alphaV0_R
+ st2 {v6.2d, v7.2d}, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmla v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmla v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
- fmla v1.2d, v24.2d, alphaV1_I
- fmla v1.2d, v25.2d, alphaV1_R
+ fmla v1.2d, v24.2d, alphaV0_I
+ fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
- fmla v5.2d, v28.2d, alphaV1_I
- fmla v5.2d, v29.2d, alphaV1_R
+ fmla v5.2d, v28.2d, alphaV0_I
+ fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmla d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmla d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
- fmla d5, d20, alphaV1_I
- fmla d5, d21, alphaV1_R
+ fmla d5, d20, alphaV0_I
+ fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
- fmla d1, d24, alphaV1_I
- fmla d1, d25, alphaV1_R
+ fmla d1, d24, alphaV0_I
+ fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
- fmla d5, d28, alphaV1_I
- fmla d5, d29, alphaV1_R
+ fmla d5, d28, alphaV0_I
+ fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmla v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
+ fmla v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
@@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmla v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmla v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
- fmla v7.2d, v22.2d, alphaV1_I
- fmla v7.2d, v23.2d, alphaV1_R
+ fmla v7.2d, v22.2d, alphaV0_I
+ fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmla v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmla v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmla d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmla d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
- fmla d5, d20, alphaV1_I
- fmla d5, d21, alphaV1_R
+ fmla d5, d20, alphaV0_I
+ fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmla v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
+ fmla v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmla v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmla v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmla d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmla d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha_save_R, d0
- fmov alpha_save_I, d1
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alphaR, d0
+ fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
@@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
+ .align 5
zgemm_kernel_L4_M4_20:
mov pB, origPB
- asr counterL , origK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , origK, #3
+ cmp counterL , #2
blt zgemm_kernel_L4_M4_32
- KERNEL4x4_I // do one in the K
- KERNEL4x4_M2 // do another in the K
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
- .align 5
+ .align 5
zgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
-
+ .align 5
zgemm_kernel_L4_M4_22a:
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
+ .align 5
zgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
@@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
zgemm_kernel_L4_M4_44:
- ands counterL , origK, #1
+ ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
+ .align 5
zgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
+ subs counterL, counterL, #1
+ bne zgemm_kernel_L4_M4_46
+
zgemm_kernel_L4_M4_100:
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
SAVE4x4
diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S
index 9e285e299..a28d1b0ce 100644
--- a/kernel/arm64/zgemv_n.S
+++ b/kernel/arm64/zgemv_n.S
@@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */
+#define A_PRE_SIZE 768
+#define Y_PRE_SIZE 768
+
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
-#define ALPHA_R_COPY s7
-#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
-#define ALPHA_R_COPY d7
-#define ALPHA_I_COPY d8
#define SHZ 4
#endif
@@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT
- /********** INIT FOR F4 LOOP **********/
- fmov ALPHA_R_COPY, ALPHA_R
- fmov ALPHA_I_COPY, ALPHA_I
-#if !defined(DOUBLE)
- ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
- ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
- ins v7.d[1], v7.d[0]
- ins v8.d[1], v8.d[0]
-#else
- ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
- ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
-#endif
-
- /******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
@@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro INIT_LOOP
- /********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
- ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
- ins v10.s[0], v9.s[1]
- ins v9.s[1], v9.s[0] // [R(X), R(X)]
- ins v10.s[1], v10.s[0] // [I(X), I(X)]
- ins v9.d[1], v9.d[0]
- ins v10.d[1], v10.d[0]
+ ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
+ ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
+ fmul v2.2s, v0.2s, v2.2s
+ fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
+ ins v3.s[0], v2.s[1]
+
+ /********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(CONJ)
#if !defined(XCONJ)
- fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
- fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
- fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
- fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
+ dup v21.4s, v2.s[0] // R[TEMP]
+ dup v22.4s, v2.s[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub s25, s25, s3
+ dup v23.4s, v25.s[0] // -I[TEMP]
+ dup v24.4s, v3.s[0] // I[TEMP]
#else
- fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
- fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
- fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
- fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
+ dup v21.4s, v2.s[0] // R[TEMP]
+ dup v22.4s, v2.s[0] // R[TEMP]
+ dup v23.4s, v3.s[0] // I[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub s25, s25, s3
+ dup v24.4s, v25.s[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
- fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
- fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
- fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
- fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
+ dup v21.4s, v2.s[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub s25, s25, s2
+ dup v22.4s, v25.s[0] // R[TEMP]
+ dup v23.4s, v3.s[0] // I[TEMP]
+ dup v24.4s, v3.s[0] // I[TEMP]
#else
- fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
- fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
- eor v12.16b, v12.16b, v12.16b
- fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
- fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
+ dup v21.4s, v2.s[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub s25, s25, s2
+ dup v22.4s, v25.s[0] // R[TEMP]
+
+ eor v25.16b, v25.16b, v25.16b
+ fsub s25, s25, s3
+ dup v23.4s, v25.s[0] // I[TEMP]
+ dup v24.4s, v25.s[0] // I[TEMP]
#endif
#endif // CONJ
+
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
- ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
- ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
- fmul v2.2s, v0.2s, v2.2s
- fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
- ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // CONJ
#else // DOUBLE
+ ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
+ ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
+ fmul v2.2d, v0.2d, v2.2d
+ fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
+ ins v3.d[0], v2.d[1] // I(TEMP)
- /********** INIT_LOOP FOR F4 LOOP **********/
- ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
- ins v10.d[0], v9.d[1]
- ins v9.d[1], v9.d[0] // [R(X), R(X)]
- ins v10.d[1], v10.d[0] // [I(X), I(X)]
+ /****** INIT_LOOP FOR F4 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
- fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
- fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
- fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
- fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
+ dup v21.2d, v2.d[0] // R[TEMP]
+ dup v22.2d, v2.d[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub d25, d25, d3
+ dup v23.2d, v25.d[0] // -I[TEMP]
+ dup v24.2d, v3.d[0] // I[TEMP]
#else
- fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
- fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
- fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
- fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
+ dup v21.2d, v2.d[0] // R[TEMP]
+ dup v22.2d, v2.d[0] // R[TEMP]
+ dup v23.2d, v3.d[0] // I[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub d25, d25, d3
+ dup v24.2d, v25.d[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
- fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
- fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
- fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
- fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
+ dup v21.2d, v2.d[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub d25, d25, d2
+ dup v22.2d, v25.d[0] // R[TEMP]
+ dup v23.2d, v3.d[0] // I[TEMP]
+ dup v24.2d, v3.d[0] // I[TEMP]
#else
- fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
- fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
- eor v12.16b, v12.16b, v12.16b
- fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
- fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
+ dup v21.2d, v2.d[0] // R[TEMP]
+ eor v25.16b, v25.16b, v25.16b
+ fsub d25, d25, d2
+ dup v22.2d, v25.d[0] // R[TEMP]
+
+ eor v25.16b, v25.16b, v25.16b
+ fsub d25, d25, d3
+ dup v23.2d, v25.d[0] // I[TEMP]
+ dup v24.2d, v25.d[0] // I[TEMP]
#endif
#endif // CONJ
+
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
- ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
- ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
- fmul v2.2d, v0.2d, v2.2d
- fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
- ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v13.4s, v14.4s}, [A_PTR], #32
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
- fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
- fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
- fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
- fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
-#else
- fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
- fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
- fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
- fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
- fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
- fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
- fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
-#else
- fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
- fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
- fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
- fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+
+ prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+ fmla v15.4s, v21.4s, v13.4s
+ fmla v15.4s, v23.4s, v14.4s
+ fmla v16.4s, v22.4s, v14.4s
+ fmla v16.4s, v24.4s, v13.4s
+
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
#else // DOUBLE
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
- fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
- fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
- fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
- fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
-#else
- fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
- fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
- fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
- fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
- fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
- fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
- fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
-#else
- fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
- fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
- fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
- fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+ prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+
+ fmla v15.2d, v21.2d, v13.2d
+ fmla v15.2d, v23.2d, v14.2d
+ fmla v16.2d, v22.2d, v14.2d
+ fmla v16.2d, v24.2d, v13.2d
+
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
- fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
- fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
- fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
- fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
-#else
- fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
- fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
- fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
- fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
- fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
- fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
- fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
-#else
- fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
- fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
- fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
- fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+ prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+ fmla v19.2d, v21.2d, v17.2d
+ fmla v19.2d, v23.2d, v18.2d
+ fmla v20.2d, v22.2d, v18.2d
+ fmla v20.2d, v24.2d, v17.2d
+
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
#endif
@@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
zgemv_n_kernel_F4:
- KERNEL_F1
- KERNEL_F1
- KERNEL_F1
- KERNEL_F1
+ KERNEL_F4
subs I, I, #1
bne zgemv_n_kernel_F4
diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S
index e61c17152..79ce9bcf2 100644
--- a/kernel/arm64/zgemv_t.S
+++ b/kernel/arm64/zgemv_t.S
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
+#define A_PRE_SIZE 768
+#define X_PRE_SIZE 768
+
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
+ prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
+ prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
@@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
- prfm PLDL1STRM, [X_PTR, #512]
+ prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
@@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
- prfm PLDL1STRM, [A_PTR, #512]
+ prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S
index 7945870d6..77a7857ff 100644
--- a/kernel/arm64/ztrmm_kernel_4x4.S
+++ b/kernel/arm64/ztrmm_kernel_4x4.S
@@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define alpha_save_R x16
-#define alpha_save_I x17
-#define temp x18
-#define tempOffset x19
-#define tempK x20
+#define pCRow3 x15
+#define pA x16
+#define alphaR x17
+#define alphaI x18
+#define temp x19
+#define tempOffset x20
+#define tempK x21
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
-#define alpha1_R d14
-#define alphaV1_R v14.d[0]
-#define alpha1_I d15
-#define alphaV1_I v15.d[0]
-
+#define A_PRE_SIZE 2560
+#define B_PRE_SIZE 448
+#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
-// 07 offset
+// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
@@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
-// 15 pA
-// 16 alpha_save_R
-// 17 alpha_save_I
-// 18 must save temp
-// 19 must save tempOffset
-// 20 must save tempK
-// 21 must save
+// 15 pCRow3
+// 16 pA
+// 17 alpha_save_R
+// 18 must save alpha_save_I
+// 19 must save temp
+// 20 must save tempOffset
+// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
- ld2 {v10.2d, v11.2d}, [pB]
- add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- ld2 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.d[0]
- OP_ii v18.2d, v3.2d, v9.d[0]
-#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
- defined(RR) || defined(RC) || defined(CR) || defined(CC)
- eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.d[0]
-#else
- fmul v19.2d, v2.2d, v9.d[0]
-#endif
- OP_ir v19.2d, v3.2d, v8.d[0]
+ ld2 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
+ ld2 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.2d, v2.2d, v9.d[0]
+#else
+ fmul v19.2d, v2.2d, v9.d[0]
+#endif
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ ld2 {v4.2d, v5.2d} , [pA]
+ add pA, pA, #32
+
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
+ ld2 {v6.2d, v7.2d} , [pA]
+ add pA, pA, #32
+
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
+ ld2 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
- ld2 {v12.2d, v13.2d}, [pB]
- add pB, pB, #32
- ld2 {v14.2d, v15.2d}, [pB]
- add pB, pB, #32
- ld2 {v4.2d, v5.2d} , [pA]
- add pA, pA, #32
- ld2 {v6.2d, v7.2d} , [pA]
- add pA, pA, #32
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
- ld2 {v12.2d, v13.2d}, [pB] // For next round
+ ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
- ld2 {v14.2d, v15.2d}, [pB] // For next round
- add pB, pB, #32
+ ld2 {v4.2d, v5.2d} , [pA]
+ add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
- ld2 {v4.2d, v5.2d} , [pA] // For next round
+ ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
- ld2 {v6.2d, v7.2d} , [pA] // For next round
- add pA, pA, #32
+ ld2 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
- prfm PLDL1KEEP, [pA, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
- prfm PLDL1KEEP, [pB, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
- ld2 {v8.2d, v9.2d}, [pB] // For next round
+ ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
- ld2 {v10.2d, v11.2d}, [pB] // For next round
- add pB, pB, #32
+ ld2 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
- ld2 {v0.2d, v1.2d}, [pA] // For next round
+ ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
- ld2 {v2.2d, v3.2d}, [pA] // For next round
- add pA, pA, #32
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
- prfm PLDL1KEEP, [pA, #512]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
- prfm PLDL1KEEP, [pB, #512]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
+
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
- ld2 {v10.2d, v11.2d}, [pB]
- add pB, pB, #32
+
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- ld2 {v2.2d, v3.2d}, [pA]
- add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v18.2d, v2.2d, v8.d[0]
- OP_ii v18.2d, v3.2d, v9.d[0]
- OP_ri v19.2d, v2.2d, v9.d[0]
- OP_ir v19.2d, v3.2d, v8.d[0]
+ ld2 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
+ ld2 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
- mov pCRow1, pCRow0
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
- st2 {v0.2d, v1.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
+ st2 {v0.2d, v1.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmul v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
- st2 {v2.2d, v3.2d}, [pCRow2]
+ fmul v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
+ st2 {v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
- add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmul v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmul v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
+
+ add pCRow1, pCRow1, #32
+
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
- fmul v7.2d, v22.2d, alphaV1_I
- fmla v7.2d, v23.2d, alphaV1_R
- st2 {v6.2d, v7.2d}, [pCRow2]
+ fmul v7.2d, v22.2d, alphaV0_I
+ fmla v7.2d, v23.2d, alphaV0_R
+ st2 {v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
- add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
- fmul v1.2d, v24.2d, alphaV1_I
- fmla v1.2d, v25.2d, alphaV1_R
- st2 {v0.2d, v1.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
+ fmul v1.2d, v24.2d, alphaV0_I
+ fmla v1.2d, v25.2d, alphaV0_R
+ st2 {v0.2d, v1.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
- fmul v3.2d, v26.2d, alphaV1_I
- fmla v3.2d, v27.2d, alphaV1_R
+ fmul v3.2d, v26.2d, alphaV0_I
+ fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
- add pCRow1, pCRow1, LDC
+ add pCRow2, pCRow2, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
- fmul v5.2d, v28.2d, alphaV1_I
- fmla v5.2d, v29.2d, alphaV1_R
- st2 {v4.2d, v5.2d}, [pCRow1]
- add pCRow2, pCRow1, #32
+ fmul v5.2d, v28.2d, alphaV0_I
+ fmla v5.2d, v29.2d, alphaV0_R
+ st2 {v4.2d, v5.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
- fmul v7.2d, v30.2d, alphaV1_I
- fmla v7.2d, v31.2d, alphaV1_R
- st2 {v6.2d, v7.2d}, [pCRow2]
+ fmul v7.2d, v30.2d, alphaV0_I
+ fmla v7.2d, v31.2d, alphaV0_R
+ st2 {v6.2d, v7.2d}, [pCRow3]
- add pCRow0, pCRow0, #64
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmul v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmul v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
- fmul v1.2d, v24.2d, alphaV1_I
- fmla v1.2d, v25.2d, alphaV1_R
+ fmul v1.2d, v24.2d, alphaV0_I
+ fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
- fmul v5.2d, v28.2d, alphaV1_I
- fmla v5.2d, v29.2d, alphaV1_R
+ fmul v5.2d, v28.2d, alphaV0_I
+ fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmul d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmul d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
- fmul d5, d20, alphaV1_I
- fmla d5, d21, alphaV1_R
+ fmul d5, d20, alphaV0_I
+ fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
- fmul d1, d24, alphaV1_I
- fmla d1, d25, alphaV1_R
+ fmul d1, d24, alphaV0_I
+ fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
- fmul d5, d28, alphaV1_I
- fmla d5, d29, alphaV1_R
+ fmul d5, d28, alphaV0_I
+ fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmul v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
+ fmul v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmul v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmul v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
- fmul v7.2d, v22.2d, alphaV1_I
- fmla v7.2d, v23.2d, alphaV1_R
+ fmul v7.2d, v22.2d, alphaV0_I
+ fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
- fmul v5.2d, v20.2d, alphaV1_I
- fmla v5.2d, v21.2d, alphaV1_R
+ fmul v5.2d, v20.2d, alphaV0_I
+ fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmul d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmul d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
- fmul d5, d20, alphaV1_I
- fmla d5, d21, alphaV1_R
+ fmul d5, d20, alphaV0_I
+ fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
- fmul v3.2d, v18.2d, alphaV1_I
- fmla v3.2d, v19.2d, alphaV1_R
+ fmul v3.2d, v18.2d, alphaV0_I
+ fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
- fmul v1.2d, v16.2d, alphaV1_I
- fmla v1.2d, v17.2d, alphaV1_R
+ fmul v1.2d, v16.2d, alphaV0_I
+ fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
- fmov alpha0_R, alpha_save_R
- fmov alpha0_I, alpha_save_I
- fmov alpha1_R, alpha0_R
- fmov alpha1_I, alpha0_I
+ fmov alpha0_R, alphaR
+ fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
- fmul d1, d16, alphaV1_I
- fmla d1, d17, alphaV1_R
+ fmul d1, d16, alphaV0_I
+ fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha_save_R, d0
- fmov alpha_save_I, d1
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alphaR, d0
+ fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble ztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
#if defined(LEFT)
mov tempOffset, offset
@@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
+ .align 5
ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
- asr counterL , tempK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , tempK, #3
+ cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
- KERNEL4x4_I // do one in the K
- KERNEL4x4_M2 // do another in the K
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
- .align 5
+ .align 5
ztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
-
+ .align 5
ztrmm_kernel_L4_M4_22a:
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
+ .align 5
ztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
@@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
ztrmm_kernel_L4_M4_44:
- ands counterL , tempK, #1
+ ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
+ .align 5
ztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
+ subs counterL, counterL, #1
+ bne ztrmm_kernel_L4_M4_46
+
ztrmm_kernel_L4_M4_100:
SAVE4x4
@@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
+
ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
diff --git a/kernel/mips/KERNEL b/kernel/mips/KERNEL
new file mode 100644
index 000000000..aeccfbf4c
--- /dev/null
+++ b/kernel/mips/KERNEL
@@ -0,0 +1,46 @@
+ifndef SNRM2KERNEL
+SNRM2KERNEL = nrm2.c
+endif
+
+ifndef DNRM2KERNEL
+DNRM2KERNEL = nrm2.c
+endif
+
+ifndef CNRM2KERNEL
+CNRM2KERNEL = znrm2.c
+endif
+
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = znrm2.c
+endif
+
+ifndef SCABS_KERNEL
+SCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef DCABS_KERNEL
+DCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef QCABS_KERNEL
+QCABS_KERNEL = ../generic/cabs.c
+endif
+
+ifndef LSAME_KERNEL
+LSAME_KERNEL = ../generic/lsame.c
+endif
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
+
+
diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
new file mode 100644
index 000000000..683579221
--- /dev/null
+++ b/kernel/mips/KERNEL.P5600
@@ -0,0 +1,221 @@
+SAMAXKERNEL = ../mips/amax.c
+DAMAXKERNEL = ../mips/amax.c
+CAMAXKERNEL = ../mips/zamax.c
+ZAMAXKERNEL = ../mips/zamax.c
+
+SAMINKERNEL = ../mips/amin.c
+DAMINKERNEL = ../mips/amin.c
+CAMINKERNEL = ../mips/zamin.c
+ZAMINKERNEL = ../mips/zamin.c
+
+SMAXKERNEL = ../mips/max.c
+DMAXKERNEL = ../mips/max.c
+
+SMINKERNEL = ../mips/min.c
+DMINKERNEL = ../mips/min.c
+
+ISAMAXKERNEL = ../mips/iamax.c
+IDAMAXKERNEL = ../mips/iamax.c
+ICAMAXKERNEL = ../mips/izamax.c
+IZAMAXKERNEL = ../mips/izamax.c
+
+ISAMINKERNEL = ../mips/iamin.c
+IDAMINKERNEL = ../mips/iamin.c
+ICAMINKERNEL = ../mips/izamin.c
+IZAMINKERNEL = ../mips/izamin.c
+
+ISMAXKERNEL = ../mips/imax.c
+IDMAXKERNEL = ../mips/imax.c
+
+ISMINKERNEL = ../mips/imin.c
+IDMINKERNEL = ../mips/imin.c
+
+ifdef HAVE_MSA
+SASUMKERNEL = ../mips/sasum_msa.c
+DASUMKERNEL = ../mips/dasum_msa.c
+CASUMKERNEL = ../mips/casum_msa.c
+ZASUMKERNEL = ../mips/zasum_msa.c
+else
+SASUMKERNEL = ../mips/asum.c
+DASUMKERNEL = ../mips/asum.c
+CASUMKERNEL = ../mips/asum.c
+ZASUMKERNEL = ../mips/asum.c
+endif
+
+SAXPYKERNEL = ../mips/axpy.c
+DAXPYKERNEL = ../mips/axpy.c
+CAXPYKERNEL = ../mips/zaxpy.c
+ZAXPYKERNEL = ../mips/zaxpy.c
+
+SCOPYKERNEL = ../mips/copy.c
+DCOPYKERNEL = ../mips/copy.c
+CCOPYKERNEL = ../mips/zcopy.c
+ZCOPYKERNEL = ../mips/zcopy.c
+
+ifdef HAVE_MSA
+SDOTKERNEL = ../mips/sdot_msa.c
+DDOTKERNEL = ../mips/ddot_msa.c
+CDOTKERNEL = ../mips/cdot_msa.c
+ZDOTKERNEL = ../mips/zdot_msa.c
+else
+SDOTKERNEL = ../mips/dot.c
+DDOTKERNEL = ../mips/dot.c
+CDOTKERNEL = ../mips/zdot.c
+ZDOTKERNEL = ../mips/zdot.c
+endif
+
+SNRM2KERNEL = ../mips/nrm2.c
+DNRM2KERNEL = ../mips/nrm2.c
+CNRM2KERNEL = ../mips/znrm2.c
+ZNRM2KERNEL = ../mips/znrm2.c
+
+SROTKERNEL = ../mips/rot.c
+DROTKERNEL = ../mips/rot.c
+CROTKERNEL = ../mips/zrot.c
+ZROTKERNEL = ../mips/zrot.c
+
+SSCALKERNEL = ../mips/scal.c
+DSCALKERNEL = ../mips/scal.c
+CSCALKERNEL = ../mips/zscal.c
+ZSCALKERNEL = ../mips/zscal.c
+
+SSWAPKERNEL = ../mips/swap.c
+DSWAPKERNEL = ../mips/swap.c
+CSWAPKERNEL = ../mips/zswap.c
+ZSWAPKERNEL = ../mips/zswap.c
+
+ifdef HAVE_MSA
+SGEMVNKERNEL = ../mips/sgemv_n_msa.c
+DGEMVNKERNEL = ../mips/dgemv_n_msa.c
+CGEMVNKERNEL = ../mips/cgemv_n_msa.c
+ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
+else
+SGEMVNKERNEL = ../mips/gemv_n.c
+DGEMVNKERNEL = ../mips/gemv_n.c
+CGEMVNKERNEL = ../mips/zgemv_n.c
+ZGEMVNKERNEL = ../mips/zgemv_n.c
+endif
+
+ifdef HAVE_MSA
+SGEMVTKERNEL = ../mips/sgemv_t_msa.c
+DGEMVTKERNEL = ../mips/dgemv_t_msa.c
+CGEMVTKERNEL = ../mips/cgemv_t_msa.c
+ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
+else
+SGEMVTKERNEL = ../mips/gemv_t.c
+DGEMVTKERNEL = ../mips/gemv_t.c
+CGEMVTKERNEL = ../mips/zgemv_t.c
+ZGEMVTKERNEL = ../mips/zgemv_t.c
+endif
+
+ifdef HAVE_MSA
+SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
+SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
+SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+else
+SGEMMKERNEL = ../generic/gemmkernel_2x2.c
+SGEMMONCOPY = ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+endif
+
+ifdef HAVE_MSA
+DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
+DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c
+DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c
+DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c
+DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+else
+DGEMMKERNEL = ../generic/gemmkernel_2x2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+endif
+
+ifdef HAVE_MSA
+CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
+CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
+CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
+CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
+CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+else
+CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+endif
+
+ifdef HAVE_MSA
+ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
+ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
+ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+else
+ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+endif
+
+ifdef HAVE_MSA
+STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
+STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
+STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
+STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
+else
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+endif
+
+ifdef HAVE_MSA
+DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
+DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
+DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
+DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
+else
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+endif
+
+ifdef HAVE_MSA
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+else
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+endif
+
+ifdef HAVE_MSA
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+else
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+endif
\ No newline at end of file
diff --git a/kernel/mips/Makefile b/kernel/mips/Makefile
new file mode 100644
index 000000000..efae70d7b
--- /dev/null
+++ b/kernel/mips/Makefile
@@ -0,0 +1,2 @@
+clean ::
+
diff --git a/kernel/mips/amax.c b/kernel/mips/amax.c
new file mode 100644
index 000000000..ad14081f5
--- /dev/null
+++ b/kernel/mips/amax.c
@@ -0,0 +1,66 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT maxf=0.0;
+
+ if (n <= 0 || inc_x <= 0) return(maxf);
+
+ maxf=ABS(x[0]);
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( ABS(x[ix]) > maxf )
+ {
+ maxf = ABS(x[ix]);
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(maxf);
+}
+
+
diff --git a/kernel/mips/amin.c b/kernel/mips/amin.c
new file mode 100644
index 000000000..8079450ff
--- /dev/null
+++ b/kernel/mips/amin.c
@@ -0,0 +1,66 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT minf=0.0;
+
+ if (n <= 0 || inc_x <= 0) return(minf);
+
+ minf=ABS(x[0]);
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( ABS(x[ix]) < minf )
+ {
+ minf = ABS(x[ix]);
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(minf);
+}
+
+
diff --git a/kernel/mips/asum.c b/kernel/mips/asum.c
new file mode 100644
index 000000000..d221464de
--- /dev/null
+++ b/kernel/mips/asum.c
@@ -0,0 +1,57 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ n *= inc_x;
+ while(i < n)
+ {
+ sumf += ABS(x[i]);
+ i += inc_x;
+ }
+ return(sumf);
+}
+
+
diff --git a/kernel/mips/axpby.c b/kernel/mips/axpby.c
new file mode 100644
index 000000000..af4fccde2
--- /dev/null
+++ b/kernel/mips/axpby.c
@@ -0,0 +1,95 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix,iy;
+
+ if ( n < 0 ) return(0);
+
+ ix = 0;
+ iy = 0;
+
+ if ( beta == 0.0 )
+ {
+
+ if ( alpha == 0.0 )
+ {
+ while(i < n)
+ {
+ y[iy] = 0.0 ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+ else
+ {
+ while(i < n)
+ {
+ y[iy] = alpha * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+ }
+
+
+ }
+
+ }
+ else
+ {
+
+ if ( alpha == 0.0 )
+ {
+ while(i < n)
+ {
+ y[iy] = beta * y[iy] ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+ else
+ {
+ while(i < n)
+ {
+ y[iy] = alpha * x[ix] + beta * y[iy] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+ }
+ }
+
+ }
+
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/axpy.c b/kernel/mips/axpy.c
new file mode 100644
index 000000000..42f181ee1
--- /dev/null
+++ b/kernel/mips/axpy.c
@@ -0,0 +1,54 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix,iy;
+
+ if ( n < 0 ) return(0);
+ if ( da == 0.0 ) return(0);
+
+ ix = 0;
+ iy = 0;
+
+ while(i < n)
+ {
+
+ y[iy] += da * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c
new file mode 100644
index 000000000..454573d56
--- /dev/null
+++ b/kernel/mips/casum_msa.c
@@ -0,0 +1,338 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i, inc_x2;
+ FLOAT sumf = 0.0;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+ v4f32 zero_v = {0};
+ v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+
+ if (n <= 0 || inc_x <= 0) return (sumf);
+
+ if (1 == inc_x)
+ {
+ if (n > 15)
+ {
+ n -= 16;
+
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 = AND_VEC_W(src0);
+ sum_abs1 = AND_VEC_W(src1);
+ sum_abs2 = AND_VEC_W(src2);
+ sum_abs3 = AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ sum_abs2 = zero_v;
+ sum_abs3 = zero_v;
+ }
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+
+ if (n & 15)
+ {
+ if ((n & 8) && (n & 4) && (n & 2))
+ {
+ LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 8) && (n & 2))
+ {
+ LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_SP3_INC(x, 4, src0, src1, src2);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 8)
+ {
+ LD_SP4_INC(x, 4, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 4)
+ {
+ LD_SP2_INC(x, 4, src0, src1);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 2)
+ {
+ src0 = LD_SP(x); x += 4;
+
+ sum_abs0 += AND_VEC_W(src0);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else
+ {
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+
+ if (n & 1)
+ {
+ sumf += fabsf(*(x + 0));
+ sumf += fabsf(*(x + 1));
+ }
+ }
+ else
+ {
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+
+ if (n > 8)
+ {
+ n -= 8;
+
+ LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 = AND_VEC_W(src0);
+ sum_abs1 = AND_VEC_W(src1);
+ sum_abs2 = AND_VEC_W(src2);
+ sum_abs3 = AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ sum_abs2 = zero_v;
+ sum_abs3 = zero_v;
+ }
+
+ for (i = (n >> 3); i--;)
+ {
+ LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_SP3_INC(x, inc_x2, src0, src1, src2);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ }
+ else if (n & 4)
+ {
+ LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ }
+ else if (n & 2)
+ {
+ LD_SP2_INC(x, inc_x2, src0, src1);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ }
+ else if (n & 1)
+ {
+ src0 = LD_SP(x); x += inc_x2;
+
+ sum_abs0 += AND_VEC_W(src0);
+ }
+ }
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0] + sum_abs0[1];
+ }
+
+ return (sumf);
+}
diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c
new file mode 100644
index 000000000..bf9f6b7e2
--- /dev/null
+++ b/kernel/mips/cdot_msa.c
@@ -0,0 +1,361 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#if !defined(CONJ)
+ #define OP2 +=
+ #define OP3 -
+ #define OP4 +
+#else
+ #define OP2 -=
+ #define OP3 +
+ #define OP4 -
+#endif
+
+#define DOT16_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i); \
+ \
+ dot0 += (vx2r * vy2r); \
+ dot0 OPR0## = (vx2i * vy2i); \
+ dot1 OPR1## = (vx2i * vy2r); \
+ dot1 += (vx2r * vy2i); \
+ \
+ dot0 += (vx3r * vy3r); \
+ dot0 OPR0## = (vx3i * vy3i); \
+ dot1 OPR1## = (vx3i * vy3r); \
+ dot1 += (vx3r * vy3i);
+
+#define DOT12_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i); \
+ \
+ dot0 += (vx2r * vy2r); \
+ dot0 OPR0## = (vx2i * vy2i); \
+ dot1 OPR1## = (vx2i * vy2r); \
+ dot1 += (vx2r * vy2i);
+
+#define DOT8_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i);
+
+#define DOT4_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i);
+
+/* return float, x,y float */
+/* cdotc - CONJ */
+/* cdotu - !CONJ */
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i = 0;
+ FLOAT dot[2];
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+ FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+ FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+ v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+ v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+ v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+ v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+ v4f32 dot0 = {0, 0, 0, 0};
+ v4f32 dot1 = {0, 0, 0, 0};
+ openblas_complex_float result;
+
+ dot[0] = 0.0;
+ dot[1] = 0.0;
+
+ __real__(result) = 0.0;
+ __imag__(result) = 0.0;
+
+ if ( n < 1 ) return(result);
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ for (i = (n >> 4); i--;)
+ {
+ LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+ PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
+
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+ PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
+
+ #if !defined(CONJ)
+ DOT16_KERNEL(-, +);
+ #else
+ DOT16_KERNEL(+, -);
+ #endif
+ }
+
+ if (n & 15)
+ {
+ if ((n & 8) && (n & 4))
+ {
+ LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+ LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+ LD_SP2_INC(x, 4, vx4, vx5);
+ LD_SP2_INC(y, 4, vy4, vy5);
+
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+
+ #if !defined(CONJ)
+ DOT12_KERNEL(-, +);
+ #else
+ DOT12_KERNEL(+, -);
+ #endif
+ }
+ else if (n & 8)
+ {
+ LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+ LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+
+ #if !defined(CONJ)
+ DOT8_KERNEL(-, +);
+ #else
+ DOT8_KERNEL(+, -);
+ #endif
+ }
+ else if (n & 4)
+ {
+ LD_SP2_INC(x, 4, vx0, vx1);
+ LD_SP2_INC(y, 4, vy0, vy1);
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+
+ #if !defined(CONJ)
+ DOT4_KERNEL(-, +);
+ #else
+ DOT4_KERNEL(+, -);
+ #endif
+ }
+
+ if ((n & 2) && (n & 1))
+ {
+ LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
+ LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+ dot[0] += ( x2 * y2 OP3 x3 * y3 );
+ dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+ dot[0] += ( x4 * y4 OP3 x5 * y5 );
+ dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+ }
+ else if (n & 2)
+ {
+ LD_GP4_INC(x, 1, x0, x1, x2, x3);
+ LD_GP4_INC(y, 1, y0, y1, y2, y3);
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+ dot[0] += ( x2 * y2 OP3 x3 * y3 );
+ dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+ }
+ else if (n & 1)
+ {
+ LD_GP2_INC(x, 1, x0, x1);
+ LD_GP2_INC(y, 1, y0, y1);
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+ }
+ }
+
+ dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
+ dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ for (i = (n >> 2); i--;)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+ x2 = *x;
+ x3 = *(x + 1);
+ x += inc_x2;
+ x4 = *x;
+ x5 = *(x + 1);
+ x += inc_x2;
+ x6 = *x;
+ x7 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+ y2 = *y;
+ y3 = *(y + 1);
+ y += inc_y2;
+ y4 = *y;
+ y5 = *(y + 1);
+ y += inc_y2;
+ y6 = *y;
+ y7 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+ dot[0] += ( x2 * y2 OP3 x3 * y3 );
+ dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+ dot[0] += ( x4 * y4 OP3 x5 * y5 );
+ dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+
+ dot[0] += ( x6 * y6 OP3 x7 * y7 );
+ dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
+ }
+
+ if ((n & 2) && (n & 1))
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+ x2 = *x;
+ x3 = *(x + 1);
+ x += inc_x2;
+ x4 = *x;
+ x5 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+ y2 = *y;
+ y3 = *(y + 1);
+ y += inc_y2;
+ y4 = *y;
+ y5 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+ dot[0] += ( x2 * y2 OP3 x3 * y3 );
+ dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+ dot[0] += ( x4 * y4 OP3 x5 * y5 );
+ dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+ }
+ else if (n & 2)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+ x2 = *x;
+ x3 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+ y2 = *y;
+ y3 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+ dot[0] += ( x2 * y2 OP3 x3 * y3 );
+ dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+ }
+ else if (n & 1)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += ( x0 * y0 OP3 x1 * y1 );
+ dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+ }
+ }
+
+ __real__(result) = dot[0];
+ __imag__(result) = dot[1];
+
+ return(result);
+}
diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c
new file mode 100644
index 000000000..cd1fa45b3
--- /dev/null
+++ b/kernel/mips/cgemm_kernel_8x4_msa.c
@@ -0,0 +1,2154 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
+ LD_SP2_INC(pb0, 4, src_b0, src_b1); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = (OP4 src_a0r) * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = (OP4 src_a1r) * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = (OP4 src_a0r) * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = (OP4 src_a1r) * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = (OP4 src_a0r) * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ res5_r OP0## = src_a1r * src_br; \
+ res5_r OP1## = src_a1i * src_bi; \
+ res5_i OP2## = (OP4 src_a1r) * src_bi; \
+ res5_i OP3## = src_a1i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = (OP4 src_a0r) * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+ \
+ res7_r OP0## = src_a1r * src_br; \
+ res7_r OP1## = src_a1i * src_bi; \
+ res7_i OP2## = (OP4 src_a1r) * src_bi; \
+ res7_i OP3## = src_a1i * src_br; \
+}
+
+#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
+ src_b0 = LD_SP(pb0); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = (OP4 src_a0r) * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = (OP4 src_a1r) * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = (OP4 src_a0r) * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = (OP4 src_a1r) * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+}
+
+#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
+ src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
+ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = (OP4 src_a0r) * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = (OP4 src_a1r) * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+}
+
+#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP2_INC(pa0, 4, src_a0, src_a1); \
+ LD_SP2_INC(pb0, 4, src_b0, src_b1); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = OP4 src_a0r * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = OP4 src_a0r * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+}
+
+#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP2_INC(pa0, 4, src_a0, src_a1); \
+ src_b0 = LD_SP(pb0); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+}
+
+#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_SP2_INC(pa0, 4, src_a0, src_a1); \
+ src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
+ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
+ \
+ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+}
+
+#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+ \
+ a1_r = pa0[2]; \
+ a1_i = pa0[3]; \
+ res2 OP0## = a1_r * b0_r; \
+ res2 OP1## = a1_i * b0_i; \
+ res3 OP2## = OP4 a1_r * b0_i; \
+ res3 OP3## = a1_i * b0_r; \
+ \
+ /* 1st col */ \
+ b1_r = pb0[2]; \
+ b1_i = pb0[3]; \
+ res4 OP0## = a0_r * b1_r; \
+ res4 OP1## = a0_i * b1_i; \
+ res5 OP2## = OP4 a0_r * b1_i; \
+ res5 OP3## = a0_i * b1_r; \
+ \
+ res6 OP0## = a1_r * b1_r; \
+ res6 OP1## = a1_i * b1_i; \
+ res7 OP2## = OP4 a1_r * b1_i; \
+ res7 OP3## = a1_i * b1_r; \
+ \
+ /* 2nd col */ \
+ b2_r = pb0[4]; \
+ b2_i = pb0[5]; \
+ res8 OP0## = a0_r * b2_r; \
+ res8 OP1## = a0_i * b2_i; \
+ res9 OP2## = OP4 a0_r * b2_i; \
+ res9 OP3## = a0_i * b2_r; \
+ \
+ res10 OP0## = a1_r * b2_r; \
+ res10 OP1## = a1_i * b2_i; \
+ res11 OP2## = OP4 a1_r * b2_i; \
+ res11 OP3## = a1_i * b2_r; \
+ \
+ /* 3rd col */ \
+ b3_r = pb0[6]; \
+ b3_i = pb0[7]; \
+ res12 OP0## = a0_r * b3_r; \
+ res12 OP1## = a0_i * b3_i; \
+ res13 OP2## = OP4 a0_r * b3_i; \
+ res13 OP3## = a0_i * b3_r; \
+ \
+ res14 OP0## = a1_r * b3_r; \
+ res14 OP1## = a1_i * b3_i; \
+ res15 OP2## = OP4 a1_r * b3_i; \
+ res15 OP3## = a1_i * b3_r; \
+}
+
+#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+ \
+ a1_r = pa0[2]; \
+ a1_i = pa0[3]; \
+ res2 OP0## = a1_r * b0_r; \
+ res2 OP1## = a1_i * b0_i; \
+ res3 OP2## = OP4 a1_r * b0_i; \
+ res3 OP3## = a1_i * b0_r; \
+ \
+ /* 1st col */ \
+ b1_r = pb0[2]; \
+ b1_i = pb0[3]; \
+ res4 OP0## = a0_r * b1_r; \
+ res4 OP1## = a0_i * b1_i; \
+ res5 OP2## = OP4 a0_r * b1_i; \
+ res5 OP3## = a0_i * b1_r; \
+ \
+ res6 OP0## = a1_r * b1_r; \
+ res6 OP1## = a1_i * b1_i; \
+ res7 OP2## = OP4 a1_r * b1_i; \
+ res7 OP3## = a1_i * b1_r; \
+}
+
+#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+ \
+ a1_r = pa0[2]; \
+ a1_i = pa0[3]; \
+ res2 OP0## = a1_r * b0_r; \
+ res2 OP1## = a1_i * b0_i; \
+ res3 OP2## = OP4 a1_r * b0_i; \
+ res3 OP3## = a1_i * b0_r; \
+}
+
+#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+ \
+ /* 1st col */ \
+ b1_r = pb0[2]; \
+ b1_i = pb0[3]; \
+ res2 OP0## = a0_r * b1_r; \
+ res2 OP1## = a0_i * b1_i; \
+ res3 OP2## = OP4 a0_r * b1_i; \
+ res3 OP3## = a0_i * b1_r; \
+ \
+ /* 2nd col */ \
+ b2_r = pb0[4]; \
+ b2_i = pb0[5]; \
+ res4 OP0## = a0_r * b2_r; \
+ res4 OP1## = a0_i * b2_i; \
+ res5 OP2## = OP4 a0_r * b2_i; \
+ res5 OP3## = a0_i * b2_r; \
+ \
+ /* 3rd col */ \
+ b3_r = pb0[6]; \
+ b3_i = pb0[7]; \
+ res6 OP0## = a0_r * b3_r; \
+ res6 OP1## = a0_i * b3_i; \
+ res7 OP2## = OP4 a0_r * b3_i; \
+ res7 OP3## = a0_i * b3_r; \
+}
+
+#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+ \
+ /* 1st col */ \
+ b1_r = pb0[2]; \
+ b1_i = pb0[3]; \
+ res2 OP0## = a0_r * b1_r; \
+ res2 OP1## = a0_i * b1_i; \
+ res3 OP2## = OP4 a0_r * b1_i; \
+ res3 OP3## = a0_i * b1_r; \
+}
+
+#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+}
+
+#define CGEMM_SCALE_8X4_MSA \
+{ \
+ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+ \
+ LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
+ \
+ LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r += alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i += alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
+ \
+ LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r += alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i += alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
+}
+
+#define CGEMM_SCALE_8X2_MSA \
+{ \
+ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+ \
+ LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
+}
+
+#define CGEMM_SCALE_8X1_MSA \
+{ \
+ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+}
+
+#define CGEMM_SCALE_4X4_MSA \
+{ \
+ LD_SP2(pc0, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+ \
+ LD_SP2(pc1, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc1, 4); \
+ \
+ LD_SP2(pc2, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc2, 4); \
+ \
+ LD_SP2(pc3, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc3, 4); \
+}
+
+#define CGEMM_SCALE_4X2_MSA \
+{ \
+ LD_SP2(pc0, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+ \
+ LD_SP2(pc1, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc1, 4); \
+}
+
+#define CGEMM_SCALE_4X1_MSA \
+{ \
+ LD_SP2(pc0, 4, dst0, dst1); \
+ \
+ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+}
+
+#define CGEMM_SCALE_2X4 \
+{ \
+ /* 0th col */ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+ pc0[2] += alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] += alphar * res3; \
+ pc0[3] += alphai * res2; \
+ \
+ /* 1st col */ \
+ pc1[0] += alphar * res4; \
+ pc1[0] -= alphai * res5; \
+ pc1[1] += alphar * res5; \
+ pc1[1] += alphai * res4; \
+ pc1[2] += alphar * res6; \
+ pc1[2] -= alphai * res7; \
+ pc1[3] += alphar * res7; \
+ pc1[3] += alphai * res6; \
+ \
+ /* 2nd col */ \
+ pc2[0] += alphar * res8; \
+ pc2[0] -= alphai * res9; \
+ pc2[1] += alphar * res9; \
+ pc2[1] += alphai * res8; \
+ pc2[2] += alphar * res10; \
+ pc2[2] -= alphai * res11; \
+ pc2[3] += alphar * res11; \
+ pc2[3] += alphai * res10; \
+ \
+ /* 3rd col */ \
+ pc3[0] += alphar * res12; \
+ pc3[0] -= alphai * res13; \
+ pc3[1] += alphar * res13; \
+ pc3[1] += alphai * res12; \
+ pc3[2] += alphar * res14; \
+ pc3[2] -= alphai * res15; \
+ pc3[3] += alphar * res15; \
+ pc3[3] += alphai * res14; \
+}
+
+#define CGEMM_SCALE_2X2 \
+{ \
+ /* 0th col */ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+ pc0[2] += alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] += alphar * res3; \
+ pc0[3] += alphai * res2; \
+ \
+ /* 1st col */ \
+ pc1[0] += alphar * res4; \
+ pc1[0] -= alphai * res5; \
+ pc1[1] += alphar * res5; \
+ pc1[1] += alphai * res4; \
+ pc1[2] += alphar * res6; \
+ pc1[2] -= alphai * res7; \
+ pc1[3] += alphar * res7; \
+ pc1[3] += alphai * res6; \
+}
+
+#define CGEMM_SCALE_2X1 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+ pc0[2] += alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] += alphar * res3; \
+ pc0[3] += alphai * res2; \
+}
+
+#define CGEMM_SCALE_1X4 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+ pc1[0] += alphar * res2; \
+ pc1[0] -= alphai * res3; \
+ pc1[1] += alphar * res3; \
+ pc1[1] += alphai * res2; \
+ \
+ pc2[0] += alphar * res4; \
+ pc2[0] -= alphai * res5; \
+ pc2[1] += alphar * res5; \
+ pc2[1] += alphai * res4; \
+ \
+ pc3[0] += alphar * res6; \
+ pc3[0] -= alphai * res7; \
+ pc3[1] += alphar * res7; \
+ pc3[1] += alphai * res6; \
+}
+
+#define CGEMM_SCALE_1X2 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+    pc1[0] += alphar * res2;             \
+    pc1[0] -= alphai * res3;             \
+    pc1[1] += alphar * res3;             \
+    pc1[1] += alphai * res2;             \
+}
+
+#define CGEMM_SCALE_1X1 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+#define CGEMM_TRMM_SCALE_8X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r = alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i = alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r = alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i = alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
+}
+
+#define CGEMM_TRMM_SCALE_8X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
+}
+
+/* Scale an 8-row x 1-column block by complex alpha (same complex-multiply
+   pattern as the 8X2 variant) and store four interleaved vectors to pc0. */
+#define CGEMM_TRMM_SCALE_8X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
+}
+
+/* Scale a 4-row x 4-column block by complex alpha.  One accumulator pair
+   (res{0,2,4,6}_r/_i) per column; each column stores two interleaved
+   vectors to its own pointer pc0..pc3. */
+#define CGEMM_TRMM_SCALE_4X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc1, 4); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc2, 4); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc3, 4); \
+}
+
+/* Scale a 4-row x 2-column block by complex alpha; two interleaved vectors
+   stored per column to pc0 and pc1. */
+#define CGEMM_TRMM_SCALE_4X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc1, 4); \
+}
+
+/* Scale a 4-row x 1-column block by complex alpha; two interleaved vectors
+   stored to pc0. */
+#define CGEMM_TRMM_SCALE_4X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_SP2_INC(dst0, dst1, pc0, 4); \
+}
+
+/* Scalar tail: scale a 2-row x 4-column block by complex alpha.  Even res
+   indices are real parts, odd are imaginary; each column pointer receives
+   two complex elements at offsets [0],[1] and [2],[3]. */
+#define CGEMM_TRMM_SCALE_2X4 \
+{ \
+ /* 0th col */ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+ pc0[2] = alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] = alphar * res3; \
+ pc0[3] += alphai * res2; \
+ \
+ /* 1st col */ \
+ pc1[0] = alphar * res4; \
+ pc1[0] -= alphai * res5; \
+ pc1[1] = alphar * res5; \
+ pc1[1] += alphai * res4; \
+ pc1[2] = alphar * res6; \
+ pc1[2] -= alphai * res7; \
+ pc1[3] = alphar * res7; \
+ pc1[3] += alphai * res6; \
+ \
+ /* 2nd col */ \
+ pc2[0] = alphar * res8; \
+ pc2[0] -= alphai * res9; \
+ pc2[1] = alphar * res9; \
+ pc2[1] += alphai * res8; \
+ pc2[2] = alphar * res10; \
+ pc2[2] -= alphai * res11; \
+ pc2[3] = alphar * res11; \
+ pc2[3] += alphai * res10; \
+ \
+ /* 3rd col */ \
+ pc3[0] = alphar * res12; \
+ pc3[0] -= alphai * res13; \
+ pc3[1] = alphar * res13; \
+ pc3[1] += alphai * res12; \
+ pc3[2] = alphar * res14; \
+ pc3[2] -= alphai * res15; \
+ pc3[3] = alphar * res15; \
+ pc3[3] += alphai * res14; \
+}
+
+/* Scalar tail: scale a 2-row x 2-column block by complex alpha; two complex
+   elements per column pointer (offsets 0..3). */
+#define CGEMM_TRMM_SCALE_2X2 \
+{ \
+ /* 0th col */ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+ pc0[2] = alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] = alphar * res3; \
+ pc0[3] += alphai * res2; \
+ \
+ /* 1st col */ \
+ pc1[0] = alphar * res4; \
+ pc1[0] -= alphai * res5; \
+ pc1[1] = alphar * res5; \
+ pc1[1] += alphai * res4; \
+ pc1[2] = alphar * res6; \
+ pc1[2] -= alphai * res7; \
+ pc1[3] = alphar * res7; \
+ pc1[3] += alphai * res6; \
+}
+
+/* Scalar tail: scale a 2-row x 1-column block by complex alpha; the two
+   complex elements are contiguous in the single column (pc0[0..3]). */
+#define CGEMM_TRMM_SCALE_2X1 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+ pc0[2] = alphar * res2; \
+ pc0[2] -= alphai * res3; \
+ pc0[3] = alphar * res3; \
+ pc0[3] += alphai * res2; \
+}
+
+/* Scalar tail: scale a 1-row x 4-column block by complex alpha; one complex
+   element per column, stored at offsets [0],[1] of pc0..pc3. */
+#define CGEMM_TRMM_SCALE_1X4 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+ pc1[0] = alphar * res2; \
+ pc1[0] -= alphai * res3; \
+ pc1[1] = alphar * res3; \
+ pc1[1] += alphai * res2; \
+ \
+ pc2[0] = alphar * res4; \
+ pc2[0] -= alphai * res5; \
+ pc2[1] = alphar * res5; \
+ pc2[1] += alphai * res4; \
+ \
+ pc3[0] = alphar * res6; \
+ pc3[0] -= alphai * res7; \
+ pc3[1] = alphar * res7; \
+ pc3[1] += alphai * res6; \
+}
+
+/* Scalar tail: scale a 1-row x 2-column block by complex alpha; one complex
+   element per column, stored at offsets [0],[1] of pc0 and pc1.
+   FIX: the second column must be written at pc1[0]/pc1[1].  The previous
+   code wrote pc1[2]/pc1[3], i.e. one complex element past the single row
+   this tail case owns — placing the result in the wrong location and
+   clobbering memory beyond the row (compare CGEMM_TRMM_SCALE_1X4 and
+   CGEMM_TRMM_SCALE_1X1, which store every column at offset 0). */
+#define CGEMM_TRMM_SCALE_1X2 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+ \
+ pc1[0] = alphar * res2; \
+ pc1[0] -= alphai * res3; \
+ pc1[1] = alphar * res3; \
+ pc1[1] += alphai * res2; \
+}
+
+/* Scalar tail: scale a single complex element by complex alpha and store it
+   at pc0[0] (real) / pc0[1] (imag). */
+#define CGEMM_TRMM_SCALE_1X1 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+/* CGEMM/CTRMM kernel driver (MIPS MSA), single-precision complex.
+   Computes the m x n output panel C from packed panels A (m x k) and
+   B (k x n), scaled by alpha = alphar + i*alphai.  All pointers step in
+   FLOATs; each complex element occupies two FLOATs, hence the recurring
+   factor of 2.  ldc is in complex elements.  When TRMMKERNEL is defined,
+   `offset` drives the triangular bookkeeping via `off`/`temp`.  The
+   NN/NR/.../CC preprocessor variants select one of four sign patterns
+   passed to the CGEMM_KERNEL_* macros for the conjugation cases.
+   Tiling: columns in groups of 4, then 2, then 1; rows in groups of
+   8, 4, 2, 1 (8/4-row paths are vectorized, 2/1-row paths scalar).
+   Returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+ FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3;
+ FLOAT *pa0, *pb0;
+ FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+ FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
+ FLOAT a0_r, a1_r;
+ FLOAT a0_i, a1_i;
+ FLOAT b0_r, b1_r, b2_r, b3_r;
+ FLOAT b0_i, b1_i, b2_i, b3_i;
+ v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
+ v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+ v4f32 dst0, dst1, dst2, dst3;
+ v4f32 alpha_r, alpha_i;
+ v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+ v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+ v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
+
+ /* Broadcast scalar alpha into MSA vectors for the vectorized paths. */
+ alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
+ alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ /* ---- column panels of 4 ---- */
+ for (j = (n >> 2); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+ pc2 = pc1 + 2 * ldc;
+ pc3 = pc2 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ /* 8-row vectorized blocks. */
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ /* Skip the triangular part already covered by other blocks. */
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ /* First k-iteration initializes the accumulators (empty first arg). */
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X4_MSA(, -, , -, -);
+#endif
+
+ /* Remaining k-1 iterations accumulate ('+' first arg). */
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X4_MSA
+#else
+ CGEMM_SCALE_8X4_MSA
+#endif
+
+ /* TRMM: advance pa0/pb0 past the untouched region and grow off. */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ /* 4-row vectorized tail. */
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X4_MSA
+#else
+ CGEMM_SCALE_4X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ /* 2-row scalar tail; pa0/pb0 are advanced explicitly here because the
+    scalar kernels do not bump the pointers themselves. */
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X4(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 8;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X4(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 8;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X4
+#else
+ CGEMM_SCALE_2X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ pc1 += 4;
+ pc2 += 4;
+ pc3 += 4;
+ }
+
+ /* 1-row scalar tail. */
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X4(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 8;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X4(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 8;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X4
+#else
+ CGEMM_SCALE_1X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
+ /* Advance B by 4 columns (k*4 complex = k*8 FLOATs) and C by 4 columns. */
+ l = k << 3;
+ B = B + l;
+ i = ldc << 3;
+ C = C + i;
+ }
+
+ /* ---- 2-column panel tail ---- */
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X2_MSA(, -, , -, -);
+#endif
+
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X2_MSA
+#else
+ CGEMM_SCALE_8X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X2_MSA
+#else
+ CGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X2(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X2(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X2
+#else
+ CGEMM_SCALE_2X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ pc1 += 4;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X2(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X2(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X2
+#else
+ CGEMM_SCALE_1X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
+
+ /* Advance B and C by 2 columns. */
+ l = k << 2;
+ B = B + l;
+ i = ldc << 2;
+ C = C + i;
+ }
+
+ /* ---- 1-column panel tail ---- */
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X1_MSA
+#else
+ CGEMM_SCALE_8X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X1_MSA
+#else
+ CGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X1(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X1(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X1
+#else
+ CGEMM_SCALE_2X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X1
+#else
+ CGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in A
+#endif
+
+ /* Advance B and C by 1 column. */
+ l = k << 1;
+ B = B + l;
+ i = ldc << 1;
+ C = C + i;
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/cgemm_ncopy_4_msa.c b/kernel/mips/cgemm_ncopy_4_msa.c
new file mode 100644
index 000000000..b38290b3d
--- /dev/null
+++ b/kernel/mips/cgemm_ncopy_4_msa.c
@@ -0,0 +1,195 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 dst0, dst1, dst4, dst5;
+
+ psrc0 = src;
+ pdst = dst;
+ lda *= 2;
+
+ for (j = (n >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+
+ ILVRL_D2_SP(src3, src1, dst0, dst4);
+ ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src2 = LD_SP(psrc2);
+ src4 = LD_SP(psrc3);
+ src6 = LD_SP(psrc4);
+ psrc1 += 4;
+ psrc2 += 4;
+ psrc3 += 4;
+ psrc4 += 4;
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+ ctemp05 = *(psrc3 + 0);
+ ctemp06 = *(psrc3 + 1);
+ ctemp07 = *(psrc4 + 0);
+ ctemp08 = *(psrc4 + 1);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ *(pdst + 2) = ctemp03;
+ *(pdst + 3) = ctemp04;
+ *(pdst + 4) = ctemp05;
+ *(pdst + 5) = ctemp06;
+ *(pdst + 6) = ctemp07;
+ *(pdst + 7) = ctemp08;
+ pdst += 8;
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+
+ ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src2 = LD_SP(psrc2);
+ psrc1 += 4;
+ psrc2 += 4;
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+ psrc1 += 2;
+ psrc2 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ *(pdst + 2) = ctemp03;
+ *(pdst + 3) = ctemp04;
+ pdst += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ psrc1 += 4;
+
+ ST_SP(src0, pdst);
+ pdst += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ psrc1 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ pdst += 2;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/cgemm_ncopy_8_msa.c b/kernel/mips/cgemm_ncopy_8_msa.c
new file mode 100644
index 000000000..9ea749069
--- /dev/null
+++ b/kernel/mips/cgemm_ncopy_8_msa.c
@@ -0,0 +1,310 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+ FLOAT *psrc8, *pdst;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07;
+ FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14;
+ FLOAT ctemp15, ctemp16;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+ v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ psrc0 = src;
+ pdst = dst;
+ lda *= 2;
+
+ for (j = (n >> 3); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc5 = psrc4 + lda;
+ psrc6 = psrc5 + lda;
+ psrc7 = psrc6 + lda;
+ psrc8 = psrc7 + lda;
+ psrc0 += 8 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
+ LD_SP2_INC(psrc5, 4, src8, src9);
+ LD_SP2_INC(psrc6, 4, src10, src11);
+ LD_SP2_INC(psrc7, 4, src12, src13);
+ LD_SP2_INC(psrc8, 4, src14, src15);
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+ ILVRL_D2_SP(src10, src8, dst2, dst6);
+ ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+ ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+
+ ILVRL_D2_SP(src3, src1, dst0, dst4);
+ ILVRL_D2_SP(src7, src5, dst1, dst5);
+ ILVRL_D2_SP(src11, src9, dst2, dst6);
+ ILVRL_D2_SP(src15, src13, dst3, dst7);
+
+ ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src2 = LD_SP(psrc2);
+ src4 = LD_SP(psrc3);
+ src6 = LD_SP(psrc4);
+ src8 = LD_SP(psrc5);
+ src10 = LD_SP(psrc6);
+ src12 = LD_SP(psrc7);
+ src14 = LD_SP(psrc8);
+ psrc1 += 4;
+ psrc2 += 4;
+ psrc3 += 4;
+ psrc4 += 4;
+ psrc5 += 4;
+ psrc6 += 4;
+ psrc7 += 4;
+ psrc8 += 4;
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+ ILVRL_D2_SP(src10, src8, dst2, dst6);
+ ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+ ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+ ctemp05 = *(psrc3 + 0);
+ ctemp06 = *(psrc3 + 1);
+ ctemp07 = *(psrc4 + 0);
+ ctemp08 = *(psrc4 + 1);
+ ctemp09 = *(psrc5 + 0);
+ ctemp10 = *(psrc5 + 1);
+ ctemp11 = *(psrc6 + 0);
+ ctemp12 = *(psrc6 + 1);
+ ctemp13 = *(psrc7 + 0);
+ ctemp14 = *(psrc7 + 1);
+ ctemp15 = *(psrc8 + 0);
+ ctemp16 = *(psrc8 + 1);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+ psrc5 += 2;
+ psrc6 += 2;
+ psrc7 += 2;
+ psrc8 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ *(pdst + 2) = ctemp03;
+ *(pdst + 3) = ctemp04;
+ *(pdst + 4) = ctemp05;
+ *(pdst + 5) = ctemp06;
+ *(pdst + 6) = ctemp07;
+ *(pdst + 7) = ctemp08;
+ *(pdst + 8) = ctemp09;
+ *(pdst + 9) = ctemp10;
+ *(pdst + 10) = ctemp11;
+ *(pdst + 11) = ctemp12;
+ *(pdst + 12) = ctemp13;
+ *(pdst + 13) = ctemp14;
+ *(pdst + 14) = ctemp15;
+ *(pdst + 15) = ctemp16;
+ pdst += 16;
+ }
+ }
+
+ if (n & 4)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+
+ ILVRL_D2_SP(src3, src1, dst0, dst4);
+ ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src2 = LD_SP(psrc2);
+ src4 = LD_SP(psrc3);
+ src6 = LD_SP(psrc4);
+ psrc1 += 4;
+ psrc2 += 4;
+ psrc3 += 4;
+ psrc4 += 4;
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+ ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+ ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+ ctemp05 = *(psrc3 + 0);
+ ctemp06 = *(psrc3 + 1);
+ ctemp07 = *(psrc4 + 0);
+ ctemp08 = *(psrc4 + 1);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ *(pdst + 2) = ctemp03;
+ *(pdst + 3) = ctemp04;
+ *(pdst + 4) = ctemp05;
+ *(pdst + 5) = ctemp06;
+ *(pdst + 6) = ctemp07;
+ *(pdst + 7) = ctemp08;
+ pdst += 8;
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+
+ ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src2 = LD_SP(psrc2);
+ psrc1 += 4;
+ psrc2 += 4;
+
+ ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+ ST_SP2_INC(dst0, dst4, pdst, 4);
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+ psrc1 += 2;
+ psrc2 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ *(pdst + 2) = ctemp03;
+ *(pdst + 3) = ctemp04;
+ pdst += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst, 4);
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ psrc1 += 4;
+
+ ST_SP(src0, pdst);
+ pdst += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ psrc1 += 2;
+
+ *(pdst + 0) = ctemp01;
+ *(pdst + 1) = ctemp02;
+ pdst += 2;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/cgemm_tcopy_4_msa.c b/kernel/mips/cgemm_tcopy_4_msa.c
new file mode 100644
index 000000000..12aaa979e
--- /dev/null
+++ b/kernel/mips/cgemm_tcopy_4_msa.c
@@ -0,0 +1,125 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0;
+ FLOAT *psrc1, *psrc2;
+ FLOAT *pdst0;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ v4f32 src0, src1, src2, src3;
+
+ psrc0 = src;
+ pdst0 = dst;
+ lda *= 2;
+
+ for (j = (n >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 8;
+
+ for (i = (m >> 1); i--;)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 4;
+
+ for (i = (m >> 1); i--;)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_SP(psrc1);
+ ST_SP(src0, pdst0);
+ pdst0 += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 2;
+
+ for (i = (m >> 1); i--;)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ pdst0 += 2;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/cgemm_tcopy_8_msa.c b/kernel/mips/cgemm_tcopy_8_msa.c
new file mode 100644
index 000000000..9f78fa73a
--- /dev/null
+++ b/kernel/mips/cgemm_tcopy_8_msa.c
@@ -0,0 +1,214 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *pdst0;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst0 = dst;
+ lda *= 2;
+
+ for (j = (n >> 3); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 16;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ LD_SP4(psrc2, 4, src4, src5, src6, src7);
+ LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11);
+ LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15);
+ ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+ ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4);
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ LD_SP4(psrc2, 4, src4, src5, src6, src7);
+ ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ }
+ }
+
+ if (n & 4)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 8;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ LD_SP2(psrc1 + 2 * lda, 4, src4, src5);
+ LD_SP2(psrc2 + 2 * lda, 4, src6, src7);
+
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ ST_SP4_INC(src4, src5, src6, src7, pdst0, 4);
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 4;
+
+ for (i = (m >> 2); i--;)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ src2 = LD_SP(psrc1 + 2 * lda);
+ src3 = LD_SP(psrc2 + 2 * lda);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_SP(psrc1);
+ ST_SP(src0, pdst0);
+ pdst0 += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 2;
+
+ for (i = (m >> 2); i--;)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 2)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ pdst0 += 2;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c
new file mode 100644
index 000000000..f1879ba00
--- /dev/null
+++ b/kernel/mips/cgemv_n_msa.c
@@ -0,0 +1,611 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
+
+#if !defined(XCONJ)
+ #define OP3 -=
+ #define OP4 +=
+#else
+ #define OP3 +=
+ #define OP4 -=
+#endif
+
+#if !defined(CONJ)
+ #if !defined(XCONJ)
+ #define OP0 -=
+ #define OP1 +=
+ #define OP2 +=
+ #else
+ #define OP0 +=
+ #define OP1 +=
+ #define OP2 -=
+ #endif
+#else
+ #if !defined(XCONJ)
+ #define OP0 +=
+ #define OP1 -=
+ #define OP2 -=
+ #else
+ #define OP0 -=
+ #define OP1 -=
+ #define OP2 +=
+ #endif
+#endif
+
+#define CGEMV_N_8x4() \
+ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
+ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
+ LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
+ LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
+ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
+ PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
+ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
+ PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
+ \
+ y0r += tp0r * src0r; \
+ y1r += tp0r * src1r; \
+ y0r += tp1r * src2r; \
+ y1r += tp1r * src3r; \
+ y0r += tp2r * src4r; \
+ y1r += tp2r * src5r; \
+ y0r += tp3r * src6r; \
+ y1r += tp3r * src7r; \
+ \
+ y0r OP0 tp0i * src0i; \
+ y1r OP0 tp0i * src1i; \
+ y0r OP0 tp1i * src2i; \
+ y1r OP0 tp1i * src3i; \
+ y0r OP0 tp2i * src4i; \
+ y1r OP0 tp2i * src5i; \
+ y0r OP0 tp3i * src6i; \
+ y1r OP0 tp3i * src7i; \
+ \
+ y0i OP1 tp0r * src0i; \
+ y1i OP1 tp0r * src1i; \
+ y0i OP1 tp1r * src2i; \
+ y1i OP1 tp1r * src3i; \
+ y0i OP1 tp2r * src4i; \
+ y1i OP1 tp2r * src5i; \
+ y0i OP1 tp3r * src6i; \
+ y1i OP1 tp3r * src7i; \
+ \
+ y0i OP2 tp0i * src0r; \
+ y1i OP2 tp0i * src1r; \
+ y0i OP2 tp1i * src2r; \
+ y1i OP2 tp1i * src3r; \
+ y0i OP2 tp2i * src4r; \
+ y1i OP2 tp2i * src5r; \
+ y0i OP2 tp3i * src6r; \
+ y1i OP2 tp3i * src7r; \
+
+#define CGEMV_N_4x4() \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t4, t5); \
+ LD_SP2(pa2 + k, 4, t8, t9); \
+ LD_SP2(pa3 + k, 4, t12, t13); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
+ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
+ \
+ y0r += tp0r * src0r; \
+ y0r += tp1r * src2r; \
+ y0r += tp2r * src4r; \
+ y0r += tp3r * src6r; \
+ \
+ y0r OP0 tp0i * src0i; \
+ y0r OP0 tp1i * src2i; \
+ y0r OP0 tp2i * src4i; \
+ y0r OP0 tp3i * src6i; \
+ \
+ y0i OP1 tp0r * src0i; \
+ y0i OP1 tp1r * src2i; \
+ y0i OP1 tp2r * src4i; \
+ y0i OP1 tp3r * src6i; \
+ \
+ y0i OP2 tp0i * src0r; \
+ y0i OP2 tp1i * src2r; \
+ y0i OP2 tp2i * src4r; \
+ y0i OP2 tp3i * src6r; \
+
+#define CGEMV_N_1x4() \
+ res0 = y[0 * inc_y2]; \
+ res1 = y[0 * inc_y2 + 1]; \
+ \
+ res0 += temp0_r * pa0[k]; \
+ res0 OP0 temp0_i * pa0[k + 1]; \
+ res0 += temp1_r * pa1[k]; \
+ res0 OP0 temp1_i * pa1[k + 1]; \
+ res0 += temp2_r * pa2[k]; \
+ res0 OP0 temp2_i * pa2[k + 1]; \
+ res0 += temp3_r * pa3[k]; \
+ res0 OP0 temp3_i * pa3[k + 1]; \
+ \
+ res1 OP1 temp0_r * pa0[k + 1]; \
+ res1 OP2 temp0_i * pa0[k]; \
+ res1 OP1 temp1_r * pa1[k + 1]; \
+ res1 OP2 temp1_i * pa1[k]; \
+ res1 OP1 temp2_r * pa2[k + 1]; \
+ res1 OP2 temp2_i * pa2[k]; \
+ res1 OP1 temp3_r * pa3[k + 1]; \
+ res1 OP2 temp3_i * pa3[k]; \
+ \
+ y[0 * inc_y2] = res0; \
+ y[0 * inc_y2 + 1] = res1; \
+
+#define CGEMV_N_8x2() \
+ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
+ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
+ \
+ y0r += tp0r * src0r; \
+ y1r += tp0r * src1r; \
+ y0r += tp1r * src2r; \
+ y1r += tp1r * src3r; \
+ \
+ y0r OP0 tp0i * src0i; \
+ y1r OP0 tp0i * src1i; \
+ y0r OP0 tp1i * src2i; \
+ y1r OP0 tp1i * src3i; \
+ \
+ y0i OP1 tp0r * src0i; \
+ y1i OP1 tp0r * src1i; \
+ y0i OP1 tp1r * src2i; \
+ y1i OP1 tp1r * src3i; \
+ \
+ y0i OP2 tp0i * src0r; \
+ y1i OP2 tp0i * src1r; \
+ y0i OP2 tp1i * src2r; \
+ y1i OP2 tp1i * src3r; \
+
+#define CGEMV_N_4x2() \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t4, t5); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ \
+ y0r += tp0r * src0r; \
+ y0r += tp1r * src2r; \
+ \
+ y0r OP0 tp0i * src0i; \
+ y0r OP0 tp1i * src2i; \
+ \
+ y0i OP1 tp0r * src0i; \
+ y0i OP1 tp1r * src2i; \
+ \
+ y0i OP2 tp0i * src0r; \
+ y0i OP2 tp1i * src2r; \
+
+#define CGEMV_N_1x2() \
+ res0 = y[0 * inc_y2]; \
+ res1 = y[0 * inc_y2 + 1]; \
+ \
+ res0 += temp0_r * pa0[k]; \
+ res0 OP0 temp0_i * pa0[k + 1]; \
+ res0 += temp1_r * pa1[k]; \
+ res0 OP0 temp1_i * pa1[k + 1]; \
+ \
+ res1 OP1 temp0_r * pa0[k + 1]; \
+ res1 OP2 temp0_i * pa0[k]; \
+ res1 OP1 temp1_r * pa1[k + 1]; \
+ res1 OP2 temp1_i * pa1[k]; \
+ \
+ y[0 * inc_y2] = res0; \
+ y[0 * inc_y2 + 1] = res1; \
+
+#define CGEMV_N_1x1() \
+ res0 = y[0 * inc_y2]; \
+ res1 = y[0 * inc_y2 + 1]; \
+ \
+ res0 += temp_r * pa0[k]; \
+ res0 OP0 temp_i * pa0[k + 1]; \
+ \
+ res1 OP1 temp_r * pa0[k + 1]; \
+ res1 OP2 temp_i * pa0[k]; \
+ \
+ y[0 * inc_y2] = res0; \
+ y[0 * inc_y2 + 1] = res1; \
+
+#define CLOAD_X4_SCALE_VECTOR() \
+ LD_SP2(x, 4, x0, x1); \
+ \
+ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
+ \
+ tp4r = alphar * x0r; \
+ tp4r OP3 alphai * x0i; \
+ tp4i = alphar * x0i; \
+ tp4i OP4 alphai * x0r; \
+ \
+ SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
+ SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \
+
+#define CLOAD_X4_SCALE_GP() \
+ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
+ \
+ tp4r = alphar * x0r; \
+ tp4r OP3 alphai * x0i; \
+ tp4i = alphar * x0i; \
+ tp4i OP4 alphai * x0r; \
+ \
+ SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
+ SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \
+
+#define CLOAD_X2_SCALE_GP() \
+ temp0_r = alpha_r * x[0 * inc_x2]; \
+ temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
+ temp0_i = alpha_r * x[0 * inc_x2 + 1]; \
+ temp0_i OP4 alpha_i * x[0 * inc_x2]; \
+ \
+ temp1_r = alpha_r * x[1 * inc_x2]; \
+ temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \
+ temp1_i = alpha_r * x[1 * inc_x2 + 1]; \
+ temp1_i OP4 alpha_i * x[1 * inc_x2]; \
+ \
+ tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \
+ tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \
+ tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \
+ tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \
+
+#define CLOAD_X1_SCALE_GP() \
+ temp_r = alpha_r * x[0 * inc_x2]; \
+ temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
+ temp_i = alpha_r * x[0 * inc_x2 + 1]; \
+ temp_i OP4 alpha_i * x[0 * inc_x2]; \
+
+#define CLOAD_Y8_VECTOR() \
+ LD_SP4(y, 4, y0, y1, y2, y3); \
+ PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
+ PCKEVOD_W2_SP(y3, y2, y1r, y1i); \
+
+#define CLOAD_Y4_VECTOR() \
+ LD_SP2(y, 4, y0, y1); \
+ PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
+
+#define CSTORE_Y8_VECTOR() \
+ ILVRL_W2_SP(y0i, y0r, y0, y1); \
+ ILVRL_W2_SP(y1i, y1r, y2, y3); \
+ ST_SP4(y0, y1, y2, y3, y, 4); \
+
+#define CSTORE_Y4_VECTOR() \
+ ILVRL_W2_SP(y0i, y0r, y0, y1); \
+ ST_SP2(y0, y1, y, 4); \
+
+#define CLOAD_Y8_GP() \
+ y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
+ y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \
+ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \
+ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \
+ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
+ y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \
+ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \
+ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \
+ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \
+
+#define CLOAD_Y4_GP() \
+ y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
+ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
+ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
+
+#define CSTORE_Y8_GP() \
+ *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
+ *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
+ *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
+ *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
+ *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \
+ *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \
+ *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \
+ *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \
+ *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
+ *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
+ *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
+ *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
+ *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \
+ *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \
+ *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \
+ *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \
+
+#define CSTORE_Y4_GP() \
+ *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
+ *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
+ *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
+ *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
+ *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
+ *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
+ *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
+ *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
+
+#define CGEMV_N_MSA() \
+ for (j = (n >> 2); j--;) \
+ { \
+ CLOAD_X4_SCALE(); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ CLOAD_Y8() \
+ CGEMV_N_8x4(); \
+ CSTORE_Y8(); \
+ \
+ k += 2 * 8; \
+ y += inc_y2 * 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ CLOAD_Y4(); \
+ CGEMV_N_4x4(); \
+ CSTORE_Y4(); \
+ \
+ k += 2 * 4; \
+ y += inc_y2 * 4; \
+ } \
+ \
+ if (m & 3) \
+ { \
+ temp0_r = tp4r[0]; \
+ temp1_r = tp4r[1]; \
+ temp2_r = tp4r[2]; \
+ temp3_r = tp4r[3]; \
+ \
+ temp0_i = tp4i[0]; \
+ temp1_i = tp4i[1]; \
+ temp2_i = tp4i[2]; \
+ temp3_i = tp4i[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ CGEMV_N_1x4(); \
+ \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ } \
+ \
+ pa0 += 4 * lda2; \
+ pa1 += 4 * lda2; \
+ pa2 += 4 * lda2; \
+ pa3 += 4 * lda2; \
+ \
+ x += 4 * inc_x2; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ CLOAD_X2_SCALE(); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ CLOAD_Y8(); \
+ CGEMV_N_8x2(); \
+ CSTORE_Y8(); \
+ \
+ k += 2 * 8; \
+ y += inc_y2 * 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ CLOAD_Y4(); \
+ CGEMV_N_4x2(); \
+ CSTORE_Y4(); \
+ \
+ k += 2 * 4; \
+ y += inc_y2 * 4; \
+ } \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ CGEMV_N_1x2(); \
+ \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ \
+ pa0 += 2 * lda2; \
+ pa1 += 2 * lda2; \
+ \
+ x += 2 * inc_x2; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ CLOAD_X1_SCALE(); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = m; i--;) \
+ { \
+ CGEMV_N_1x1(); \
+ \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ \
+ pa0 += lda2; \
+ x += inc_x2; \
+ } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
+ BLASLONG inc_y2, FLOAT *buffer)
+{
+ BLASLONG i, j, k;
+ FLOAT *y_org = y;
+ FLOAT *pa0, *pa1, *pa2, *pa3;
+ FLOAT temp_r, temp_i, res0, res1, temp0_r;
+ FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
+ v4f32 alphar, alphai;
+ v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
+ v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+ v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+ v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+ v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;
+
+ lda2 = 2 * lda2;
+ inc_x2 = 2 * inc_x2;
+ inc_y2 = 2 * inc_y2;
+
+ pa0 = A;
+ pa1 = A + lda2;
+ pa2 = A + 2 * lda2;
+ pa3 = A + 3 * lda2;
+
+ alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
+ alphai = COPY_FLOAT_TO_VECTOR(alpha_i);
+
+ if ((2 == inc_x2) && (2 == inc_y2))
+ {
+ #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
+ #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
+ #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
+ #define CLOAD_Y8 CLOAD_Y8_VECTOR
+ #define CLOAD_Y4 CLOAD_Y4_VECTOR
+ #define CSTORE_Y8 CSTORE_Y8_VECTOR
+ #define CSTORE_Y4 CSTORE_Y4_VECTOR
+
+ CGEMV_N_MSA();
+
+ #undef CLOAD_X4_SCALE
+ #undef CLOAD_X2_SCALE
+ #undef CLOAD_X1_SCALE
+ #undef CLOAD_Y8
+ #undef CLOAD_Y4
+ #undef CSTORE_Y8
+ #undef CSTORE_Y4
+ }
+ else if (2 == inc_x2)
+ {
+ #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
+ #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
+ #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
+ #define CLOAD_Y8 CLOAD_Y8_GP
+ #define CLOAD_Y4 CLOAD_Y4_GP
+ #define CSTORE_Y8 CSTORE_Y8_GP
+ #define CSTORE_Y4 CSTORE_Y4_GP
+
+ CGEMV_N_MSA();
+
+ #undef CLOAD_X4_SCALE
+ #undef CLOAD_X2_SCALE
+ #undef CLOAD_X1_SCALE
+ #undef CLOAD_Y8
+ #undef CLOAD_Y4
+ #undef CSTORE_Y8
+ #undef CSTORE_Y4
+ }
+ else if (2 == inc_y2)
+ {
+ #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
+ #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
+ #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
+ #define CLOAD_Y8 CLOAD_Y8_VECTOR
+ #define CLOAD_Y4 CLOAD_Y4_VECTOR
+ #define CSTORE_Y8 CSTORE_Y8_VECTOR
+ #define CSTORE_Y4 CSTORE_Y4_VECTOR
+
+ CGEMV_N_MSA();
+
+ #undef CLOAD_X4_SCALE
+ #undef CLOAD_X2_SCALE
+ #undef CLOAD_X1_SCALE
+ #undef CLOAD_Y8
+ #undef CLOAD_Y4
+ #undef CSTORE_Y8
+ #undef CSTORE_Y4
+ }
+ else
+ {
+ #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
+ #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
+ #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
+ #define CLOAD_Y8 CLOAD_Y8_GP
+ #define CLOAD_Y4 CLOAD_Y4_GP
+ #define CSTORE_Y8 CSTORE_Y8_GP
+ #define CSTORE_Y4 CSTORE_Y4_GP
+
+ CGEMV_N_MSA();
+
+ #undef CLOAD_X4_SCALE
+ #undef CLOAD_X2_SCALE
+ #undef CLOAD_X1_SCALE
+ #undef CLOAD_Y8
+ #undef CLOAD_Y4
+ #undef CSTORE_Y8
+ #undef CSTORE_Y4
+ }
+ return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c
new file mode 100644
index 000000000..b9620bfb9
--- /dev/null
+++ b/kernel/mips/cgemv_t_msa.c
@@ -0,0 +1,583 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ #define OP0 -=
+ #define OP1 +=
+ #define OP2 +=
+#else
+ #define OP0 +=
+ #define OP1 +=
+ #define OP2 -=
+#endif
+
+#define CGEMV_T_8x4() \
+ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
+ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
+ LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
+ LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
+ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
+ PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
+ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
+ PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r += src1r * x1r; \
+ tp0r OP0 src0i * x0i; \
+ tp0r OP0 src1i * x1i; \
+ \
+ tp1r += src2r * x0r; \
+ tp1r += src3r * x1r; \
+ tp1r OP0 src2i * x0i; \
+ tp1r OP0 src3i * x1i; \
+ \
+ tp2r += src4r * x0r; \
+ tp2r += src5r * x1r; \
+ tp2r OP0 src4i * x0i; \
+ tp2r OP0 src5i * x1i; \
+ \
+ tp3r += src6r * x0r; \
+ tp3r += src7r * x1r; \
+ tp3r OP0 src6i * x0i; \
+ tp3r OP0 src7i * x1i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP1 src1r * x1i; \
+ tp0i OP2 src0i * x0r; \
+ tp0i OP2 src1i * x1r; \
+ \
+ tp1i OP1 src2r * x0i; \
+ tp1i OP1 src3r * x1i; \
+ tp1i OP2 src2i * x0r; \
+ tp1i OP2 src3i * x1r; \
+ \
+ tp2i OP1 src4r * x0i; \
+ tp2i OP1 src5r * x1i; \
+ tp2i OP2 src4i * x0r; \
+ tp2i OP2 src5i * x1r; \
+ \
+ tp3i OP1 src6r * x0i; \
+ tp3i OP1 src7r * x1i; \
+ tp3i OP2 src6i * x0r; \
+ tp3i OP2 src7i * x1r; \
+
+#define CGEMV_T_8x2() \
+ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
+ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r += src1r * x1r; \
+ tp0r OP0 src0i * x0i; \
+ tp0r OP0 src1i * x1i; \
+ \
+ tp1r += src2r * x0r; \
+ tp1r += src3r * x1r; \
+ tp1r OP0 src2i * x0i; \
+ tp1r OP0 src3i * x1i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP1 src1r * x1i; \
+ tp0i OP2 src0i * x0r; \
+ tp0i OP2 src1i * x1r; \
+ \
+ tp1i OP1 src2r * x0i; \
+ tp1i OP1 src3r * x1i; \
+ tp1i OP2 src2i * x0r; \
+ tp1i OP2 src3i * x1r; \
+
+#define CGEMV_T_8x1() \
+ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r += src1r * x1r; \
+ tp0r OP0 src0i * x0i; \
+ tp0r OP0 src1i * x1i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP1 src1r * x1i; \
+ tp0i OP2 src0i * x0r; \
+ tp0i OP2 src1i * x1r; \
+
+#define CGEMV_T_4x4() \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t4, t5); \
+ LD_SP2(pa2 + k, 4, t8, t9); \
+ LD_SP2(pa3 + k, 4, t12, t13); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
+ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r OP0 src0i * x0i; \
+ \
+ tp1r += src2r * x0r; \
+ tp1r OP0 src2i * x0i; \
+ \
+ tp2r += src4r * x0r; \
+ tp2r OP0 src4i * x0i; \
+ \
+ tp3r += src6r * x0r; \
+ tp3r OP0 src6i * x0i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP2 src0i * x0r; \
+ \
+ tp1i OP1 src2r * x0i; \
+ tp1i OP2 src2i * x0r; \
+ \
+ tp2i OP1 src4r * x0i; \
+ tp2i OP2 src4i * x0r; \
+ \
+ tp3i OP1 src6r * x0i; \
+ tp3i OP2 src6i * x0r; \
+
+#define CGEMV_T_4x2() \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t4, t5); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r OP0 src0i * x0i; \
+ \
+ tp1r += src2r * x0r; \
+ tp1r OP0 src2i * x0i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP2 src0i * x0r; \
+ \
+ tp1i OP1 src2r * x0i; \
+ tp1i OP2 src2i * x0r; \
+
+#define CGEMV_T_4x1() \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ \
+ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
+ \
+ tp0r += src0r * x0r; \
+ tp0r OP0 src0i * x0i; \
+ \
+ tp0i OP1 src0r * x0i; \
+ tp0i OP2 src0i * x0r; \
+
+#define CGEMV_T_1x4() \
+ temp0r += pa0[k + 0] * x[0 * inc_x2]; \
+ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
+ temp1r += pa1[k + 0] * x[0 * inc_x2]; \
+ temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
+ temp2r += pa2[k + 0] * x[0 * inc_x2]; \
+ temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \
+ temp3r += pa3[k + 0] * x[0 * inc_x2]; \
+ temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \
+ \
+ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
+ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
+ temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
+ temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \
+ temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \
+ temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \
+ temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \
+ temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \
+
+#define CGEMV_T_1x2() \
+ temp0r += pa0[k + 0] * x[0 * inc_x2]; \
+ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
+ temp1r += pa1[k + 0] * x[0 * inc_x2]; \
+ temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \
+ \
+ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
+ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
+ temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \
+ temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \
+
+#define CGEMV_T_1x1() \
+ temp0r += pa0[k + 0] * x[0 * inc_x2]; \
+ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \
+ \
+ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \
+ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \
+
+#define CSCALE_STORE_Y4_GP() \
+ res0r = y[0 * inc_y2]; \
+ res1r = y[1 * inc_y2]; \
+ res2r = y[2 * inc_y2]; \
+ res3r = y[3 * inc_y2]; \
+ \
+ res0i = y[0 * inc_y2 + 1]; \
+ res1i = y[1 * inc_y2 + 1]; \
+ res2i = y[2 * inc_y2 + 1]; \
+ res3i = y[3 * inc_y2 + 1]; \
+ \
+ res0r += alphar * temp0r; \
+ res0r OP0 alphai * temp0i; \
+ res1r += alphar * temp1r; \
+ res1r OP0 alphai * temp1i; \
+ res2r += alphar * temp2r; \
+ res2r OP0 alphai * temp2i; \
+ res3r += alphar * temp3r; \
+ res3r OP0 alphai * temp3i; \
+ \
+ res0i OP1 alphar * temp0i; \
+ res0i OP2 alphai * temp0r; \
+ res1i OP1 alphar * temp1i; \
+ res1i OP2 alphai * temp1r; \
+ res2i OP1 alphar * temp2i; \
+ res2i OP2 alphai * temp2r; \
+ res3i OP1 alphar * temp3i; \
+ res3i OP2 alphai * temp3r; \
+ \
+ y[0 * inc_y2] = res0r; \
+ y[1 * inc_y2] = res1r; \
+ y[2 * inc_y2] = res2r; \
+ y[3 * inc_y2] = res3r; \
+ \
+ y[0 * inc_y2 + 1] = res0i; \
+ y[1 * inc_y2 + 1] = res1i; \
+ y[2 * inc_y2 + 1] = res2i; \
+ y[3 * inc_y2 + 1] = res3i; \
+
+#define CSCALE_STORE_Y2_GP() \
+ res0r = y[0 * inc_y2]; \
+ res1r = y[1 * inc_y2]; \
+ \
+ res0i = y[0 * inc_y2 + 1]; \
+ res1i = y[1 * inc_y2 + 1]; \
+ \
+ res0r += alphar * temp0r; \
+ res0r OP0 alphai * temp0i; \
+ res1r += alphar * temp1r; \
+ res1r OP0 alphai * temp1i; \
+ \
+ res0i OP1 alphar * temp0i; \
+ res0i OP2 alphai * temp0r; \
+ res1i OP1 alphar * temp1i; \
+ res1i OP2 alphai * temp1r; \
+ \
+ y[0 * inc_y2] = res0r; \
+ y[1 * inc_y2] = res1r; \
+ \
+ y[0 * inc_y2 + 1] = res0i; \
+ y[1 * inc_y2 + 1] = res1i; \
+
+
+#define CSCALE_STORE_Y1_GP() \
+ res0r = y[0 * inc_y2]; \
+ res0i = y[0 * inc_y2 + 1]; \
+ \
+ res0r += alphar * temp0r; \
+ res0r OP0 alphai * temp0i; \
+ \
+ res0i OP1 alphar * temp0i; \
+ res0i OP2 alphai * temp0r; \
+ \
+ y[0 * inc_y2] = res0r; \
+ y[0 * inc_y2 + 1] = res0i; \
+
+#define CLOAD_X8_VECTOR() \
+ LD_SP4(x, 4, x0, x1, x2, x3); \
+ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
+ PCKEVOD_W2_SP(x3, x2, x1r, x1i); \
+
+#define CLOAD_X4_VECTOR() \
+ LD_SP2(x, 4, x0, x1); \
+ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
+
+#define CLOAD_X8_GP() \
+ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
+ x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \
+ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \
+ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \
+ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
+ x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \
+ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \
+ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \
+ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \
+
+#define CLOAD_X4_GP() \
+ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
+ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
+ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
+
+#define CGEMV_T_MSA() \
+ for (j = (n >> 2); j--;) \
+ { \
+ tp0r = tp1r = tp2r = tp3r = zero; \
+ tp0i = tp1i = tp2i = tp3i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ CLOAD_X8() \
+ CGEMV_T_8x4(); \
+ \
+ k += 2 * 8; \
+ x += inc_x2 * 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ CLOAD_X4(); \
+ \
+ CGEMV_T_4x4(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \
+ tp0r, tp1r, tp2r, tp3r); \
+ TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \
+ tp0i, tp1i, tp2i, tp3i); \
+ \
+ tp0r += tp1r; \
+ tp0r += tp2r; \
+ tp0r += tp3r; \
+ tp0i += tp1i; \
+ tp0i += tp2i; \
+ tp0i += tp3i; \
+ \
+ temp0r = tp0r[0]; \
+ temp1r = tp0r[1]; \
+ temp2r = tp0r[2]; \
+ temp3r = tp0r[3]; \
+ temp0i = tp0i[0]; \
+ temp1i = tp0i[1]; \
+ temp2i = tp0i[2]; \
+ temp3i = tp0i[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ CGEMV_T_1x4(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ CSCALE_STORE_Y4_GP(); \
+ \
+ pa0 += 4 * lda2; \
+ pa1 += 4 * lda2; \
+ pa2 += 4 * lda2; \
+ pa3 += 4 * lda2; \
+ y += 4 * inc_y2; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ tp0r = tp1r = zero; \
+ tp0i = tp1i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ CLOAD_X8(); \
+ \
+ CGEMV_T_8x2(); \
+ \
+ k += 2 * 8; \
+ x += inc_x2 * 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ CLOAD_X4(); \
+ \
+ CGEMV_T_4x2(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \
+ tp0r, tp1r, tp0i, tp1i); \
+ \
+ tp0r += tp1r; \
+ tp0r += tp0i; \
+ tp0r += tp1i; \
+ \
+ temp0r = tp0r[0]; \
+ temp1r = tp0r[1]; \
+ temp0i = tp0r[2]; \
+ temp1i = tp0r[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ CGEMV_T_1x2(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ CSCALE_STORE_Y2_GP(); \
+ \
+ pa0 += 2 * lda2; \
+ pa1 += 2 * lda2; \
+ y += 2 * inc_y2; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ tp0r = zero; \
+ tp0i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ CLOAD_X8(); \
+ \
+ CGEMV_T_8x1(); \
+ \
+ k += 2 * 8; \
+ x += inc_x2 * 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ CLOAD_X4(); \
+ \
+ CGEMV_T_4x1(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ ILVRL_W2_SP(tp0i, tp0r, t0, t1); \
+ \
+ t0 += t1; \
+ \
+ temp0r = t0[0] + t0[2]; \
+ temp0i = t0[1] + t0[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ CGEMV_T_1x1(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ CSCALE_STORE_Y1_GP(); \
+ \
+ pa0 += lda2; \
+ y += inc_y2; \
+ } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
+ FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+ BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i, j, k;
+ FLOAT *pa0, *pa1, *pa2, *pa3;
+ FLOAT *srcx_org = x;
+ FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
+ FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
+ BLASLONG inc_x2, inc_y2, lda2;
+ v4f32 zero = {0};
+ v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
+ v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+ v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+ v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+ v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
+
+ lda2 = 2 * lda;
+
+ pa0 = A;
+ pa1 = A + lda2;
+ pa2 = A + 2 * lda2;
+ pa3 = A + 3 * lda2;
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ if (2 == inc_x2)
+ {
+ #define CLOAD_X8 CLOAD_X8_VECTOR
+ #define CLOAD_X4 CLOAD_X4_VECTOR
+
+ CGEMV_T_MSA();
+
+ #undef CLOAD_X8
+ #undef CLOAD_X4
+ }
+ else
+ {
+ #define CLOAD_X8 CLOAD_X8_GP
+ #define CLOAD_X4 CLOAD_X4_GP
+
+ CGEMV_T_MSA();
+
+ #undef CLOAD_X8
+ #undef CLOAD_X4
+ }
+
+ return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
diff --git a/kernel/mips/copy.c b/kernel/mips/copy.c
new file mode 100644
index 000000000..9f488ddb3
--- /dev/null
+++ b/kernel/mips/copy.c
@@ -0,0 +1,50 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ if ( n < 0 ) return(0);
+
+ while(i < n)
+ {
+
+ y[iy] = x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c
new file mode 100644
index 000000000..a3641cd50
--- /dev/null
+++ b/kernel/mips/dasum_msa.c
@@ -0,0 +1,278 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i;
+ FLOAT sumf = 0.0;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+ v2f64 zero_v = {0};
+ v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
+
+ if (n <= 0 || inc_x <= 0) return (sumf);
+
+ if (1 == inc_x)
+ {
+ if (n > 15)
+ {
+ n -= 16;
+
+ LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 = AND_VEC_D(src0);
+ sum_abs1 = AND_VEC_D(src1);
+ sum_abs2 = AND_VEC_D(src2);
+ sum_abs3 = AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ sum_abs2 = zero_v;
+ sum_abs3 = zero_v;
+ }
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+
+ if (n & 15)
+ {
+ if ((n & 8) && (n & 4) && (n & 2))
+ {
+ LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ }
+ else if ((n & 8) && (n & 2))
+ {
+ LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP3_INC(x, 2, src0, src1, src2);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ }
+ else if (n & 8)
+ {
+ LD_DP4_INC(x, 2, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ }
+ else if (n & 4)
+ {
+ LD_DP2_INC(x, 2, src0, src1);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ }
+ else if (n & 2)
+ {
+ src0 = LD_DP(x); x += 2;
+
+ sum_abs0 += AND_VEC_D(src0);
+ }
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0] + sum_abs0[1];
+
+ if (n & 1)
+ {
+ sumf += fabs(*x);
+ }
+ }
+ else
+ {
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0] + sum_abs0[1];
+ }
+ }
+ else
+ {
+ if (n > 8)
+ {
+ n -= 8;
+
+ LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 = AND_VEC_D(src0);
+ sum_abs1 = AND_VEC_D(src1);
+ sum_abs2 = AND_VEC_D(src2);
+ sum_abs3 = AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ sum_abs2 = zero_v;
+ sum_abs3 = zero_v;
+ }
+
+ for (i = (n >> 3); i--;)
+ {
+ LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_DP3_INC(x, inc_x, src0, src1, src2);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ }
+ else if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ }
+ else if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x, src0, src1);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ }
+ else if (n & 1)
+ {
+ src0 = LD_DP(x);
+
+ sum_abs0 += AND_VEC_D(src0);
+ }
+ }
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf = sum_abs0[0];
+ }
+
+ return (sumf);
+}
diff --git a/kernel/mips/ddot_msa.c b/kernel/mips/ddot_msa.c
new file mode 100644
index 000000000..b56e10135
--- /dev/null
+++ b/kernel/mips/ddot_msa.c
@@ -0,0 +1,189 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* return float, x,y float */
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i = 0;
+ double dot = 0.0;
+ FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
+ v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+ v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+ v2f64 dot0 = {0, 0};
+
+ if (n < 0) return (dot);
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ dot0 += (vy6 * vx6);
+ dot0 += (vy7 * vx7);
+ }
+
+ if (n & 15)
+ {
+ if ((n & 8) && (n & 4) && (n & 2))
+ {
+ LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
+ LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ dot0 += (vy6 * vx6);
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
+ LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ }
+ else if ((n & 8) && (n & 2))
+ {
+ LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
+ LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP3_INC(x, 2, vx0, vx1, vx2);
+ LD_DP3_INC(y, 2, vy0, vy1, vy2);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ }
+ else if (n & 8)
+ {
+ LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
+ LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ }
+ else if (n & 4)
+ {
+ LD_DP2_INC(x, 2, vx0, vx1);
+ LD_DP2_INC(y, 2, vy0, vy1);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ }
+ else if (n & 2)
+ {
+ vx0 = LD_DP(x); x += 2;
+ vy0 = LD_DP(y); y += 2;
+
+ dot0 += (vy0 * vx0);
+ }
+
+ if (n & 1)
+ {
+ x0 = *x;
+ y0 = *y;
+
+ dot += (y0 * x0);
+ }
+ }
+
+ dot += dot0[0];
+ dot += dot0[1];
+ }
+ else
+ {
+ for (i = (n >> 2); i--;)
+ {
+ LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+ LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ dot += (y2 * x2);
+ dot += (y3 * x3);
+ }
+
+ if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(x, inc_x, x0, x1, x2);
+ LD_GP3_INC(y, inc_y, y0, y1, y2);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ dot += (y2 * x2);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(x, inc_x, x0, x1);
+ LD_GP2_INC(y, inc_y, y0, y1);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ }
+ else if (n & 1)
+ {
+ x0 = *x;
+ y0 = *y;
+
+ dot += (y0 * x0);
+ }
+ }
+
+ return (dot);
+}
diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c
new file mode 100644
index 000000000..9286e7469
--- /dev/null
+++ b/kernel/mips/dgemm_kernel_8x4_msa.c
@@ -0,0 +1,1566 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
+ FLOAT *C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
+ FLOAT tmp0, tmp1, tmp2, tmp3;
+ FLOAT a0, b0, b1, b2, b3;
+ v2f64 v_alpha = {alpha, alpha};
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1;
+ v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
+ v2f64 res8, res9, res10, res11, res12, res13, res14, res15;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 2); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + ldc;
+ pc2 = pc1 + ldc;
+ pc3 = pc2 + ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+ res2 = src_a2 * src_b;
+ res3 = src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 = src_a0 * src_b;
+ res5 = src_a1 * src_b;
+ res6 = src_a2 * src_b;
+ res7 = src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res8 = src_a0 * src_b;
+ res9 = src_a1 * src_b;
+ res10 = src_a2 * src_b;
+ res11 = src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res12 = src_a0 * src_b;
+ res13 = src_a1 * src_b;
+ res14 = src_a2 * src_b;
+ res15 = src_a3 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+ res10 += src_a2 * src_b;
+ res11 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+ res14 += src_a2 * src_b;
+ res15 += src_a3 * src_b;
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+ res10 += src_a2 * src_b;
+ res11 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+ res14 += src_a2 * src_b;
+ res15 += src_a3 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+ res10 += src_a2 * src_b;
+ res11 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+ res14 += src_a2 * src_b;
+ res15 += src_a3 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+ dst4 += res4 * v_alpha;
+ dst5 += res5 * v_alpha;
+ dst6 += res6 * v_alpha;
+ dst7 += res7 * v_alpha;
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+ dst0 = res8 * v_alpha;
+ dst1 = res9 * v_alpha;
+ dst2 = res10 * v_alpha;
+ dst3 = res11 * v_alpha;
+ dst4 = res12 * v_alpha;
+ dst5 = res13 * v_alpha;
+ dst6 = res14 * v_alpha;
+ dst7 = res15 * v_alpha;
+#else
+ LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);
+ LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);
+
+ dst0 += res8 * v_alpha;
+ dst1 += res9 * v_alpha;
+ dst2 += res10 * v_alpha;
+ dst3 += res11 * v_alpha;
+ dst4 += res12 * v_alpha;
+ dst5 += res13 * v_alpha;
+ dst6 += res14 * v_alpha;
+ dst7 += res15 * v_alpha;
+#endif
+
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 = src_a0 * src_b;
+ res3 = src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res4 = src_a0 * src_b;
+ res5 = src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res6 = src_a0 * src_b;
+ res7 = src_a1 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
+ LD_DP2(pc0, 2, dst0, dst1);
+ LD_DP2(pc1, 2, dst2, dst3);
+ LD_DP2(pc2, 2, dst4, dst5);
+ LD_DP2(pc3, 2, dst6, dst7);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+ dst4 += res4 * v_alpha;
+ dst5 += res5 * v_alpha;
+ dst6 += res6 * v_alpha;
+ dst7 += res7 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+ ST_DP2_INC(dst2, dst3, pc1, 2);
+ ST_DP2_INC(dst4, dst5, pc2, 2);
+ ST_DP2_INC(dst6, dst7, pc3, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 = src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res2 = src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res3 = src_a0 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res2 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res3 += src_a0 * src_b;
+
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res2 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res3 += src_a0 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ res2 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ res3 += src_a0 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ dst0 = LD_DP(pc0);
+ dst1 = LD_DP(pc1);
+ dst2 = LD_DP(pc2);
+ dst3 = LD_DP(pc3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_DP(dst0, pc0);
+ ST_DP(dst1, pc1);
+ ST_DP(dst2, pc2);
+ ST_DP(dst3, pc3);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 = a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 = a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 = a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+ tmp2 = alpha * tmp2;
+ tmp3 = alpha * tmp3;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp1;
+ pc2[0] += tmp2;
+ pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 1;
+ pc1 += 1;
+ pc2 += 1;
+ pc3 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
+ l = (k << 2);
+ B = B + l;
+ i = (ldc << 2);
+ C = C + i;
+ }
+
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+ res2 = src_a2 * src_b;
+ res3 = src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 = src_a0 * src_b;
+ res5 = src_a1 * src_b;
+ res6 = src_a2 * src_b;
+ res7 = src_a3 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+ res6 += src_a2 * src_b;
+ res7 += src_a3 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+ dst4 += res4 * v_alpha;
+ dst5 += res5 * v_alpha;
+ dst6 += res6 * v_alpha;
+ dst7 += res7 * v_alpha;
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 = src_a0 * src_b;
+ res3 = src_a1 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ LD_DP2(pc0, 2, dst0, dst1);
+ LD_DP2(pc1, 2, dst2, dst3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+ ST_DP2_INC(dst2, dst3, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 = src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 = src_a0 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_DP(pa0);
+ pa0 += 2;
+ src_b0 = LD_DP(pb0);
+ pb0 += 2;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ res1 += src_a0 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
+ dst0 = LD_DP(pc0);
+ dst1 = LD_DP(pc1);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+#endif
+ ST_DP(dst0, pc0);
+ ST_DP(dst1, pc1);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 = a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 1;
+ pc1 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
+
+ l = (k << 1);
+ B = B + l;
+ i = (ldc << 1);
+ C = C + i;
+ }
+
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+ res2 = src_a2 * src_b;
+ res3 = src_a3 * src_b;
+
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ pb0 += 1;
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+ res2 += src_a2 * src_b;
+ res3 += src_a3 * src_b;
+
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
+ LD_DP2(pc0, 2, dst0, dst1);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_DP(pa0);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 = src_a0 * src_b;
+
+ pa0 += 2;
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_DP(pa0);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+
+ pa0 += 2;
+ pb0 += 1;
+
+ src_a0 = LD_DP(pa0);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+
+ pa0 += 2;
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_DP(pa0);
+ src_b[0] = pb0[0];
+ src_b[1] = pb0[0];
+
+ res0 += src_a0 * src_b;
+
+ pa0 += 2;
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+#else
+ dst0 = LD_DP(pc0);
+
+ dst0 += res0 * v_alpha;
+#endif
+ ST_DP(dst0, pc0);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ pc0 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ pc0[0] = alpha * tmp0;
+#else
+ pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in A
+#endif
+
+ l = (k << 0);
+ B = B + l;
+ i = (ldc << 0);
+ C = C + i;
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c
new file mode 100644
index 000000000..a61b2e806
--- /dev/null
+++ b/kernel/mips/dgemm_ncopy_4_msa.c
@@ -0,0 +1,118 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
+          FLOAT * __restrict dst)
+{   /* Pack an m x n column-major block (leading dim lda) into dst as panels of up to 4 columns for the DGEMM kernel. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;   /* psrc1..psrc4 walk down 4 adjacent columns; psrc0 tracks the next panel */
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;  /* MSA 2-double vectors */
+    v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    psrc0 = src;
+    pdst = dst;
+
+    for (j = (n >> 2); j--;)   /* full panels of 4 columns */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;      /* advance to the next 4-column panel */
+
+        for (i = (m >> 2); i--;)   /* 4 rows per iteration, 2 doubles per vector load */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            /* ILVRL_D2_DP presumably interleaves the low/high doubles of its
+               two inputs (2x2 transpose of vector lanes) — see macros_msa.h. */
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src3, src1, dst2, dst6);
+            ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
+        }
+
+        for (i = (m & 3); i--;)   /* leftover rows: one scalar from each of the 4 columns */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+        }
+    }
+
+    if (n & 2)   /* remaining 2-column panel */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)   /* 4 rows per iteration */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src3, src1, dst1, dst5);
+
+            ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2);
+        }
+
+        for (i = (m & 3); i--;)   /* leftover rows */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+    }
+
+    if (n & 1)   /* final single column: straight copy */
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;)   /* 4 elements at a time */
+        {
+            LD_DP2(psrc1, 2, src0, src1);
+            psrc1 += 4;
+
+            ST_DP2(src0, src1, pdst, 2);
+            pdst += 4;
+        }
+
+        for (i = (m & 3); i--;)
+        {
+            *pdst++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c
new file mode 100644
index 000000000..86d019c4f
--- /dev/null
+++ b/kernel/mips/dgemm_ncopy_8_msa.c
@@ -0,0 +1,186 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
+          FLOAT * __restrict dst)
+{   /* Pack an m x n column-major block (leading dim lda) into dst as panels of up to 8 columns for the DGEMM kernel. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;   /* psrc1..psrc8 walk down 8 adjacent columns */
+    FLOAT *psrc8, *pdst;
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;   /* MSA 2-double vectors */
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+    v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    psrc0 = src;
+    pdst = dst;
+
+    for (j = (n >> 3); j--;)   /* full panels of 8 columns */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;      /* advance to the next 8-column panel */
+
+        for (i = (m >> 3); i--;)   /* 8 rows per iteration: two 4-row sub-steps */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            /* ILVRL_D2_DP presumably interleaves the low/high doubles of its
+               two inputs (2x2 lane transpose) — see macros_msa.h. */
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src10, src8, dst2, dst6);
+            ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            ILVRL_D2_DP(src3, src1, dst0, dst4);
+            ILVRL_D2_DP(src7, src5, dst1, dst5);
+            ILVRL_D2_DP(src11, src9, dst2, dst6);
+            ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            LD_DP2_INC(psrc1, 2, src0, src1);   /* second group of 4 rows */
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src10, src8, dst2, dst6);
+            ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            ILVRL_D2_DP(src3, src1, dst0, dst4);
+            ILVRL_D2_DP(src7, src5, dst1, dst5);
+            ILVRL_D2_DP(src11, src9, dst2, dst6);
+            ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+        }
+
+        for (i = (m & 7); i--;)   /* leftover rows: one scalar from each of the 8 columns */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+            *pdst++ = *psrc5++;
+            *pdst++ = *psrc6++;
+            *pdst++ = *psrc7++;
+            *pdst++ = *psrc8++;
+        }
+    }
+
+    if (n & 4)   /* remaining 4-column panel */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        for (i = (m >> 2); i--;)   /* 4 rows per iteration */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src3, src1, dst2, dst6);
+            ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
+        }
+
+        for (i = (m & 3); i--;)   /* leftover rows */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+        }
+    }
+
+    if (n & 2)   /* remaining 2-column panel */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 1); i--;)   /* 2 rows per iteration */
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            ILVRL_D2_DP(src1, src0, dst0, dst1);
+
+            ST_DP2_INC(dst0, dst1, pdst, 2);
+        }
+
+        if (m & 1)   /* odd trailing row */
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+    }
+
+    if (n & 1)   /* final single column: straight scalar copy */
+    {
+        psrc1 = psrc0;
+
+        for (i = m; i--;)
+        {
+            *pdst++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c
new file mode 100644
index 000000000..a51c47429
--- /dev/null
+++ b/kernel/mips/dgemm_tcopy_4_msa.c
@@ -0,0 +1,153 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
+          FLOAT * __restrict dst)
+{   /* Transpose-pack an m x n block (leading dim lda) into dst in row-panels of width up to 4 for the DGEMM kernel. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+    FLOAT *pdst0, *pdst1, *pdst2, *pdst3;   /* pdst0: 4-wide panels; pdst2/pdst3: tail regions for the n&2 / n&1 remainders */
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;   /* MSA 2-double vectors */
+
+    psrc0 = src;
+    pdst0 = dst;
+
+    pdst2 = dst + m * (n & ~3);   /* start of the 2-wide remainder region */
+    pdst3 = dst + m * (n & ~1);   /* start of the 1-wide remainder region */
+
+    for (j = (m >> 2); j--;)   /* 4 rows at a time */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 16;   /* next 4x4 tile slot within this row-panel */
+
+        for (i = (n >> 2); i--;)   /* 4 columns per iteration */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            pdst1 += m * 4;   /* stride between consecutive 4-column tiles of the same row-panel */
+        }
+
+        if (n & 2)   /* 2-column remainder goes to the pdst2 region */
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
+        }
+
+        if (n & 1)   /* 1-column remainder goes to the pdst3 region */
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc3++;
+            *pdst3++ = *psrc4++;
+        }
+    }
+
+    if (m & 2)   /* remaining pair of rows */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 8;
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+
+            ST_DP4(src0, src1, src2, src3, pdst1, 2);
+            pdst1 += m * 4;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            ST_DP2_INC(src0, src1, pdst2, 2);
+        }
+
+        if (n & 1)
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc2++;
+        }
+    }
+
+    if (m & 1)   /* final single row */
+    {
+        psrc1 = psrc0;
+        pdst1 = pdst0;
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+
+            ST_DP2(src0, src1, pdst1, 2);
+            pdst1 += 4 * m;
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            psrc1 += 2;
+
+            ST_DP(src0, pdst2);
+        }
+
+        if (n & 1)
+        {
+            *pdst3 = *psrc1;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c
new file mode 100644
index 000000000..350ecb359
--- /dev/null
+++ b/kernel/mips/dgemm_tcopy_8_msa.c
@@ -0,0 +1,276 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
+          FLOAT * __restrict dst)
+{   /* Transpose-pack an m x n block (leading dim lda) into dst in row-panels of width up to 8 for the DGEMM kernel. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
+    FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;   /* pdst0: 8-wide panels; pdst2/pdst3/pdst4: tails for the n&4 / n&2 / n&1 remainders */
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;   /* MSA 2-double vectors */
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    psrc0 = src;
+    pdst0 = dst;
+
+    pdst2 = dst + m * (n & ~7);   /* start of the 4-wide remainder region */
+    pdst3 = dst + m * (n & ~3);   /* start of the 2-wide remainder region */
+    pdst4 = dst + m * (n & ~1);   /* start of the 1-wide remainder region */
+
+    for (j = (m >> 3); j--;)   /* 8 rows at a time */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 64;   /* next 8x8 tile slot within this row-panel */
+
+        for (i = (n >> 3); i--;)   /* 8 columns per iteration */
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 16, 2);
+
+            LD_DP4_INC(psrc5, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc6, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc7, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc8, 2, src12, src13, src14, src15);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
+                   2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 48, 2);
+            pdst1 += m * 8;   /* stride between consecutive 8-column tiles of the same row-panel */
+        }
+
+        if (n & 4)   /* 4-column remainder goes to the pdst2 region */
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+            ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15,
+                       pdst2, 2);
+        }
+
+        if (n & 2)   /* 2-column remainder goes to the pdst3 region */
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            src4 = LD_DP(psrc5);
+            src5 = LD_DP(psrc6);
+            src6 = LD_DP(psrc7);
+            src7 = LD_DP(psrc8);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+            psrc5 += 2;
+            psrc6 += 2;
+            psrc7 += 2;
+            psrc8 += 2;
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
+        }
+
+        if (n & 1)   /* 1-column remainder goes to the pdst4 region */
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+            *pdst4++ = *psrc5++;
+            *pdst4++ = *psrc6++;
+            *pdst4++ = *psrc7++;
+            *pdst4++ = *psrc8++;
+        }
+    }
+
+    if (m & 4)   /* remaining group of 4 rows */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 32;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 16, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+        }
+    }
+
+    if (m & 2)   /* remaining pair of rows */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 16;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+
+            ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            ST_DP2_INC(src0, src1, pdst3, 2);
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+        }
+    }
+
+    if (m & 1)   /* final single row */
+    {
+        psrc1 = psrc0;
+        psrc0 += lda;
+
+        pdst1 = pdst0;
+        pdst0 += 8;
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+
+            ST_DP4(src0, src1, src2, src3, pdst1, 2);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+
+            ST_DP2_INC(src0, src1, pdst2, 2);
+        }
+
+        if (n & 2)
+        {
+            src0 = LD_DP(psrc1);
+            psrc1 += 2;
+
+            ST_DP(src0, pdst3);
+            pdst3 += 2;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c
new file mode 100644
index 000000000..09bb063ff
--- /dev/null
+++ b/kernel/mips/dgemv_n_msa.c
@@ -0,0 +1,577 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define DGEMV_N_8x8() \
+{ \
+ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+ LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
+ LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
+ LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
+ LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
+ LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
+ LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
+ \
+ y0 += tp0 * t0; \
+ y1 += tp0 * t1; \
+ y2 += tp0 * t2; \
+ y3 += tp0 * t3; \
+ \
+ y0 += tp1 * t4; \
+ y1 += tp1 * t5; \
+ y2 += tp1 * t6; \
+ y3 += tp1 * t7; \
+ \
+ y0 += tp2 * t8; \
+ y1 += tp2 * t9; \
+ y2 += tp2 * t10; \
+ y3 += tp2 * t11; \
+ \
+ y0 += tp3 * t12; \
+ y1 += tp3 * t13; \
+ y2 += tp3 * t14; \
+ y3 += tp3 * t15; \
+ \
+ y0 += tp4 * t16; \
+ y1 += tp4 * t17; \
+ y2 += tp4 * t18; \
+ y3 += tp4 * t19; \
+ \
+ y0 += tp5 * t20; \
+ y1 += tp5 * t21; \
+ y2 += tp5 * t22; \
+ y3 += tp5 * t23; \
+ \
+ y0 += tp6 * t24; \
+ y1 += tp6 * t25; \
+ y2 += tp6 * t26; \
+ y3 += tp6 * t27; \
+ \
+ y0 += tp7 * t28; \
+ y1 += tp7 * t29; \
+ y2 += tp7 * t30; \
+ y3 += tp7 * t31; \
+}
+
+#define DGEMV_N_4x8() /* y0..y1 (4 rows of y) += 8 A columns scaled by tp0..tp7 */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+    LD_DP2(pa2 + k, 2, t8, t9); \
+    LD_DP2(pa3 + k, 2, t12, t13); \
+    LD_DP2(pa4 + k, 2, t16, t17); \
+    LD_DP2(pa5 + k, 2, t20, t21); \
+    LD_DP2(pa6 + k, 2, t24, t25); \
+    LD_DP2(pa7 + k, 2, t28, t29); \
+ \
+    y0 += tp0 * t0; \
+    y1 += tp0 * t1; \
+ \
+    y0 += tp1 * t4; \
+    y1 += tp1 * t5; \
+ \
+    y0 += tp2 * t8; \
+    y1 += tp2 * t9; \
+ \
+    y0 += tp3 * t12; \
+    y1 += tp3 * t13; \
+ \
+    y0 += tp4 * t16; \
+    y1 += tp4 * t17; \
+ \
+    y0 += tp5 * t20; \
+    y1 += tp5 * t21; \
+ \
+    y0 += tp6 * t24; \
+    y1 += tp6 * t25; \
+ \
+    y0 += tp7 * t28; \
+    y1 += tp7 * t29; \
+}
+
+#define DGEMV_N_8x4() /* y0..y3 (8 rows of y) += 4 A columns scaled by tp0..tp3 */ \
+{ \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
+ \
+    y0 += tp0 * t0; \
+    y1 += tp0 * t1; \
+    y2 += tp0 * t2; \
+    y3 += tp0 * t3; \
+ \
+    y0 += tp1 * t4; \
+    y1 += tp1 * t5; \
+    y2 += tp1 * t6; \
+    y3 += tp1 * t7; \
+ \
+    y0 += tp2 * t8; \
+    y1 += tp2 * t9; \
+    y2 += tp2 * t10; \
+    y3 += tp2 * t11; \
+ \
+    y0 += tp3 * t12; \
+    y1 += tp3 * t13; \
+    y2 += tp3 * t14; \
+    y3 += tp3 * t15; \
+}
+
+#define DGEMV_N_4x4() /* y0..y1 (4 rows of y) += 4 A columns scaled by tp0..tp3 */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+    LD_DP2(pa2 + k, 2, t8, t9); \
+    LD_DP2(pa3 + k, 2, t12, t13); \
+ \
+    y0 += tp0 * t0; \
+    y1 += tp0 * t1; \
+ \
+    y0 += tp1 * t4; \
+    y1 += tp1 * t5; \
+ \
+    y0 += tp2 * t8; \
+    y1 += tp2 * t9; \
+ \
+    y0 += tp3 * t12; \
+    y1 += tp3 * t13; \
+}
+
+#define DGEMV_N_8x2() /* y0..y3 (8 rows of y) += 2 A columns scaled by tp0..tp1 */ \
+{ \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+ \
+    y0 += tp0 * t0; \
+    y1 += tp0 * t1; \
+    y2 += tp0 * t2; \
+    y3 += tp0 * t3; \
+ \
+    y0 += tp1 * t4; \
+    y1 += tp1 * t5; \
+    y2 += tp1 * t6; \
+    y3 += tp1 * t7; \
+}
+
+#define DGEMV_N_4x2() /* y0..y1 (4 rows of y) += 2 A columns scaled by tp0..tp1 */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+ \
+    y0 += tp0 * t0; \
+    y1 += tp0 * t1; \
+ \
+    y0 += tp1 * t4; \
+    y1 += tp1 * t5; \
+}
+
+#define DLOAD_X8_SCALE_GP() /* strided (inc_x != 1) path: gather 8 x values, scale by alpha, splat each into a v2f64 */ \
+    temp0 = alpha * x[0 * inc_x]; \
+    temp1 = alpha * x[1 * inc_x]; \
+    temp2 = alpha * x[2 * inc_x]; \
+    temp3 = alpha * x[3 * inc_x]; \
+    temp4 = alpha * x[4 * inc_x]; \
+    temp5 = alpha * x[5 * inc_x]; \
+    temp6 = alpha * x[6 * inc_x]; \
+    temp7 = alpha * x[7 * inc_x]; \
+ \
+    tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
+    tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
+    tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
+    tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
+    tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
+    tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
+    tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
+    tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \
+
+#define DLOAD_X4_SCALE_GP() /* strided path: gather 4 x values, scale by alpha, splat into tp0..tp3 */ \
+    temp0 = alpha * x[0 * inc_x]; \
+    temp1 = alpha * x[1 * inc_x]; \
+    temp2 = alpha * x[2 * inc_x]; \
+    temp3 = alpha * x[3 * inc_x]; \
+ \
+    tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
+    tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
+    tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
+    tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
+
+#define DLOAD_X8_SCALE_VECTOR() /* contiguous (inc_x == 1) path: load 8 x values, scale, splat lane pairs into tp0..tp7 */ \
+    LD_DP4(x, 2, x0, x1, x2, x3); \
+ \
+    x0 = x0 * v_alpha; \
+    x1 = x1 * v_alpha; \
+    x2 = x2 * v_alpha; \
+    x3 = x3 * v_alpha; \
+ \
+    SPLATI_D2_DP(x0, tp0, tp1); \
+    SPLATI_D2_DP(x1, tp2, tp3); \
+    SPLATI_D2_DP(x2, tp4, tp5); \
+    SPLATI_D2_DP(x3, tp6, tp7); \
+
+#define DLOAD_X4_SCALE_VECTOR() /* contiguous path: load 4 x values, scale, splat into tp0..tp3 */ \
+    LD_DP2(x, 2, x0, x1); \
+ \
+    x0 = x0 * v_alpha; \
+    x1 = x1 * v_alpha; \
+ \
+    SPLATI_D2_DP(x0, tp0, tp1); \
+    SPLATI_D2_DP(x1, tp2, tp3); \
+
+#define DLOAD_Y8_GP() /* gather 8 strided y elements into y0..y3 via 64-bit lane inserts (tp0 only seeds the register) */ \
+    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
+    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y))); \
+    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
+    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y))); \
+    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
+    y2 = (v2f64) __msa_insert_d((v2i64) y2,  1, *((long long *)(y + 5 * inc_y))); \
+    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
+    y3 = (v2f64) __msa_insert_d((v2i64) y3,  1, *((long long *)(y + 7 * inc_y))); \
+
+#define DLOAD_Y4_GP() /* gather 4 strided y elements into y0..y1 via 64-bit lane inserts */ \
+    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
+    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y))); \
+    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
+    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y))); \
+
+#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3);  /* contiguous load of 8 y elements */
+#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1);          /* contiguous load of 4 y elements */
+
+#define DSTORE_Y8_GP() /* scatter y0..y3 back to 8 strided y slots via 64-bit lane copies */ \
+    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
+    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
+    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
+    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
+    *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
+    *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
+    *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
+    *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \
+
+#define DSTORE_Y4_GP() /* scatter y0..y1 back to 4 strided y slots */ \
+    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
+    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
+    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
+    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
+
+#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2);  /* contiguous store of 8 y elements */
+#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2);          /* contiguous store of 4 y elements */
+
+#define DGEMV_N_MSA() /* full y += alpha*A*x driver: columns in blocks of 8/4/2/1, rows in blocks of 8/4 plus scalar tail */ \
+    for (j = (n >> 3); j--;) \
+    { \
+        DLOAD_X8_SCALE(); /* tp0..tp7 = alpha * next 8 x values, splatted */ \
+ \
+        k = 0; \
+        y = y_org; /* restart at the top of y for each column block */ \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_Y8(); \
+            DGEMV_N_8x8(); \
+            DSTORE_Y8(); \
+ \
+            y += 8 * inc_y; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_Y4(); \
+            DGEMV_N_4x8(); \
+            DSTORE_Y4(); \
+ \
+            y += 4 * inc_y; \
+            k += 4; \
+        } \
+ \
+        if (m & 3) /* scalar tail: up to 3 remaining rows */ \
+        { \
+            temp0 = alpha * x[0 * inc_x]; \
+            temp1 = alpha * x[1 * inc_x]; \
+            temp2 = alpha * x[2 * inc_x]; \
+            temp3 = alpha * x[3 * inc_x]; \
+            temp4 = alpha * x[4 * inc_x]; \
+            temp5 = alpha * x[5 * inc_x]; \
+            temp6 = alpha * x[6 * inc_x]; \
+            temp7 = alpha * x[7 * inc_x]; \
+ \
+            for (i = (m & 3); i--;) \
+            { \
+                temp = y[0]; \
+                temp += temp0 * pa0[k]; \
+                temp += temp1 * pa1[k]; \
+                temp += temp2 * pa2[k]; \
+                temp += temp3 * pa3[k]; \
+                temp += temp4 * pa4[k]; \
+                temp += temp5 * pa5[k]; \
+                temp += temp6 * pa6[k]; \
+                temp += temp7 * pa7[k]; \
+                y[0] = temp; \
+ \
+                y += inc_y; \
+                k++; \
+            } \
+        } \
+        pa0 += 8 * lda; /* advance all 8 column pointers to the next column block */ \
+        pa1 += 8 * lda; \
+        pa2 += 8 * lda; \
+        pa3 += 8 * lda; \
+        pa4 += 8 * lda; \
+        pa5 += 8 * lda; \
+        pa6 += 8 * lda; \
+        pa7 += 8 * lda; \
+ \
+        x += 8 * inc_x; \
+    } \
+ \
+    if (n & 4) /* 4 remaining columns */ \
+    { \
+        DLOAD_X4_SCALE(); \
+ \
+        k = 0; \
+        y = y_org; \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_Y8(); \
+            DGEMV_N_8x4(); \
+            DSTORE_Y8(); \
+ \
+            y += 8 * inc_y; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_Y4(); \
+            DGEMV_N_4x4(); \
+            DSTORE_Y4(); \
+ \
+            y += 4 * inc_y; \
+            k += 4; \
+        } \
+ \
+        if (m & 3) \
+        { \
+            temp0 = alpha * x[0 * inc_x]; \
+            temp1 = alpha * x[1 * inc_x]; \
+            temp2 = alpha * x[2 * inc_x]; \
+            temp3 = alpha * x[3 * inc_x]; \
+ \
+            for (i = (m & 3); i--;) \
+            { \
+                temp = y[0]; \
+                temp += temp0 * pa0[k]; \
+                temp += temp1 * pa1[k]; \
+                temp += temp2 * pa2[k]; \
+                temp += temp3 * pa3[k]; \
+                y[0] = temp; \
+ \
+                y += inc_y; \
+                k++; \
+            } \
+        } \
+ \
+        pa0 += 4 * lda; \
+        pa1 += 4 * lda; \
+        pa2 += 4 * lda; \
+        pa3 += 4 * lda; \
+ \
+        x += 4 * inc_x; \
+    } \
+ \
+    if (n & 2) /* 2 remaining columns */ \
+    { \
+        temp0 = alpha * x[0 * inc_x]; \
+        temp1 = alpha * x[1 * inc_x]; \
+ \
+        tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
+        tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
+ \
+        k = 0; \
+        y = y_org; \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_Y8(); \
+            DGEMV_N_8x2(); \
+            DSTORE_Y8(); \
+ \
+            y += 8 * inc_y; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_Y4(); \
+            DGEMV_N_4x2(); \
+            DSTORE_Y4(); \
+ \
+            y += 4 * inc_y; \
+            k += 4; \
+        } \
+ \
+        if (m & 3) \
+        { \
+            temp0 = alpha * x[0 * inc_x]; \
+            temp1 = alpha * x[1 * inc_x]; \
+ \
+            for (i = (m & 3); i--;) \
+            { \
+                temp = y[0]; \
+                temp += temp0 * pa0[k]; \
+                temp += temp1 * pa1[k]; \
+                y[0] = temp; \
+ \
+                y += inc_y; \
+                k++; \
+            } \
+        } \
+ \
+        pa0 += 2 * lda; \
+        pa1 += 2 * lda; \
+ \
+        x += 2 * inc_x; \
+    } \
+ \
+    if (n & 1) /* final column: plain scalar axpy into y */ \
+    { \
+        temp = alpha * x[0]; \
+ \
+        k = 0; \
+        y = y_org; \
+ \
+        for (i = m; i--;) \
+        { \
+            y[0] += temp * pa0[k]; \
+            y += inc_y; \
+            k++; \
+        } \
+    } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, /* y := alpha*A*x + y; A is m x n, column-major, leading dimension lda */
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer) /* buffer: workspace from the interface layer, unused here */
+{
+    BLASLONG i, j, k; /* i: row loop, j: column-block loop, k: row offset into each column */
+    FLOAT *y_org = y; /* y base pointer; DGEMV_N_MSA rewinds to it per column block */
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; /* pointers to 8 consecutive columns of A */
+    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v2f64 v_alpha;
+    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+
+    v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
+
+    pa0 = A;
+    pa1 = A + lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if ((1 == inc_x) && (1 == inc_y)) /* select vector-load vs. gather/scatter variants per stride before expanding the driver */
+    {
+        #define DLOAD_X8_SCALE  DLOAD_X8_SCALE_VECTOR
+        #define DLOAD_X4_SCALE  DLOAD_X4_SCALE_VECTOR
+        #define DLOAD_Y8        DLOAD_Y8_VECTOR
+        #define DLOAD_Y4        DLOAD_Y4_VECTOR
+        #define DSTORE_Y8       DSTORE_Y8_VECTOR
+        #define DSTORE_Y4       DSTORE_Y4_VECTOR
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else if (1 == inc_y) /* strided x, contiguous y */
+    {
+        #define DLOAD_X8_SCALE  DLOAD_X8_SCALE_GP
+        #define DLOAD_X4_SCALE  DLOAD_X4_SCALE_GP
+        #define DLOAD_Y8        DLOAD_Y8_VECTOR
+        #define DLOAD_Y4        DLOAD_Y4_VECTOR
+        #define DSTORE_Y8       DSTORE_Y8_VECTOR
+        #define DSTORE_Y4       DSTORE_Y4_VECTOR
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else if (1 == inc_x) /* contiguous x, strided y */
+    {
+        #define DLOAD_X8_SCALE  DLOAD_X8_SCALE_VECTOR
+        #define DLOAD_X4_SCALE  DLOAD_X4_SCALE_VECTOR
+        #define DLOAD_Y8        DLOAD_Y8_GP
+        #define DLOAD_Y4        DLOAD_Y4_GP
+        #define DSTORE_Y8       DSTORE_Y8_GP
+        #define DSTORE_Y4       DSTORE_Y4_GP
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else /* both strided */
+    {
+        #define DLOAD_X8_SCALE  DLOAD_X8_SCALE_GP
+        #define DLOAD_X4_SCALE  DLOAD_X4_SCALE_GP
+        #define DLOAD_Y8        DLOAD_Y8_GP
+        #define DLOAD_Y4        DLOAD_Y4_GP
+        #define DSTORE_Y8       DSTORE_Y8_GP
+        #define DSTORE_Y4       DSTORE_Y4_GP
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/dgemv_t_msa.c b/kernel/mips/dgemv_t_msa.c
new file mode 100644
index 000000000..f74cb2e66
--- /dev/null
+++ b/kernel/mips/dgemv_t_msa.c
@@ -0,0 +1,589 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define DGEMV_T_8x8() /* accumulate partial dot products tp0..tp7 (one per column) over 8 rows */ \
+{ \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
+    LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
+    LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
+    LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
+    LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+    tp0 += x2 * t2; \
+    tp0 += x3 * t3; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+    tp1 += x2 * t6; \
+    tp1 += x3 * t7; \
+ \
+    tp2 += x0 * t8; \
+    tp2 += x1 * t9; \
+    tp2 += x2 * t10; \
+    tp2 += x3 * t11; \
+ \
+    tp3 += x0 * t12; \
+    tp3 += x1 * t13; \
+    tp3 += x2 * t14; \
+    tp3 += x3 * t15; \
+ \
+    tp4 += x0 * t16; \
+    tp4 += x1 * t17; \
+    tp4 += x2 * t18; \
+    tp4 += x3 * t19; \
+ \
+    tp5 += x0 * t20; \
+    tp5 += x1 * t21; \
+    tp5 += x2 * t22; \
+    tp5 += x3 * t23; \
+ \
+    tp6 += x0 * t24; \
+    tp6 += x1 * t25; \
+    tp6 += x2 * t26; \
+    tp6 += x3 * t27; \
+ \
+    tp7 += x0 * t28; \
+    tp7 += x1 * t29; \
+    tp7 += x2 * t30; \
+    tp7 += x3 * t31; \
+}
+
+#define DGEMV_T_8x4() /* 8 columns, 4 rows: extend tp0..tp7 partial dot products */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+    LD_DP2(pa2 + k, 2, t8, t9); \
+    LD_DP2(pa3 + k, 2, t12, t13); \
+    LD_DP2(pa4 + k, 2, t16, t17); \
+    LD_DP2(pa5 + k, 2, t20, t21); \
+    LD_DP2(pa6 + k, 2, t24, t25); \
+    LD_DP2(pa7 + k, 2, t28, t29); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+ \
+    tp2 += x0 * t8; \
+    tp2 += x1 * t9; \
+ \
+    tp3 += x0 * t12; \
+    tp3 += x1 * t13; \
+ \
+    tp4 += x0 * t16; \
+    tp4 += x1 * t17; \
+ \
+    tp5 += x0 * t20; \
+    tp5 += x1 * t21; \
+ \
+    tp6 += x0 * t24; \
+    tp6 += x1 * t25; \
+ \
+    tp7 += x0 * t28; \
+    tp7 += x1 * t29; \
+}
+
+#define DGEMV_T_8x2() /* 8 columns, 2 rows (one vector) */ \
+{ \
+    t0  = LD_DP(pa0 + k); \
+    t4  = LD_DP(pa1 + k); \
+    t8  = LD_DP(pa2 + k); \
+    t12 = LD_DP(pa3 + k); \
+    t16 = LD_DP(pa4 + k); \
+    t20 = LD_DP(pa5 + k); \
+    t24 = LD_DP(pa6 + k); \
+    t28 = LD_DP(pa7 + k); \
+ \
+    tp0 += x0 * t0; \
+    tp1 += x0 * t4; \
+    tp2 += x0 * t8; \
+    tp3 += x0 * t12; \
+    tp4 += x0 * t16; \
+    tp5 += x0 * t20; \
+    tp6 += x0 * t24; \
+    tp7 += x0 * t28; \
+}
+
+#define DGEMV_T_4x8() /* 4 columns, 8 rows: extend tp0..tp3 partial dot products */ \
+{ \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+    tp0 += x2 * t2; \
+    tp0 += x3 * t3; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+    tp1 += x2 * t6; \
+    tp1 += x3 * t7; \
+ \
+    tp2 += x0 * t8; \
+    tp2 += x1 * t9; \
+    tp2 += x2 * t10; \
+    tp2 += x3 * t11; \
+ \
+    tp3 += x0 * t12; \
+    tp3 += x1 * t13; \
+    tp3 += x2 * t14; \
+    tp3 += x3 * t15; \
+}
+
+#define DGEMV_T_4x4() /* 4 columns, 4 rows */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+    LD_DP2(pa2 + k, 2, t8, t9); \
+    LD_DP2(pa3 + k, 2, t12, t13); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+ \
+    tp2 += x0 * t8; \
+    tp2 += x1 * t9; \
+ \
+    tp3 += x0 * t12; \
+    tp3 += x1 * t13; \
+}
+
+#define DGEMV_T_4x2() /* 4 columns, 2 rows */ \
+{ \
+    t0  = LD_DP(pa0 + k); \
+    t4  = LD_DP(pa1 + k); \
+    t8  = LD_DP(pa2 + k); \
+    t12 = LD_DP(pa3 + k); \
+ \
+    tp0 += x0 * t0; \
+    tp1 += x0 * t4; \
+    tp2 += x0 * t8; \
+    tp3 += x0 * t12; \
+}
+
+#define DGEMV_T_2x8() /* 2 columns, 8 rows: extend tp0..tp1 */ \
+{ \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+    tp0 += x2 * t2; \
+    tp0 += x3 * t3; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+    tp1 += x2 * t6; \
+    tp1 += x3 * t7; \
+}
+
+#define DGEMV_T_2x4() /* 2 columns, 4 rows */ \
+{ \
+    LD_DP2(pa0 + k, 2, t0, t1); \
+    LD_DP2(pa1 + k, 2, t4, t5); \
+ \
+    tp0 += x0 * t0; \
+    tp0 += x1 * t1; \
+ \
+    tp1 += x0 * t4; \
+    tp1 += x1 * t5; \
+}
+
+#define DGEMV_T_2x2() /* 2 columns, 2 rows */ \
+{ \
+    t0 = LD_DP(pa0 + k); \
+    t4 = LD_DP(pa1 + k); \
+ \
+    tp0 += x0 * t0; \
+    tp1 += x0 * t4; \
+}
+
+#define DLOAD_X8_GP() /* gather 8 strided x elements into x0..x3 via 64-bit lane inserts (tp0 only seeds the register) */ \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x))); \
+    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
+    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x))); \
+    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \
+    x2 = (v2f64) __msa_insert_d((v2i64) x2,  1, *((long long *)(x + 5 * inc_x))); \
+    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \
+    x3 = (v2f64) __msa_insert_d((v2i64) x3,  1, *((long long *)(x + 7 * inc_x))); \
+
+#define DLOAD_X4_GP() /* gather 4 strided x elements into x0..x1 */ \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x))); \
+    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \
+    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x))); \
+
+#define DLOAD_X2_GP() /* gather 2 strided x elements into x0 */ \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x))); \
+
+#define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3);  /* contiguous load of 8 x elements */
+#define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1);          /* contiguous load of 4 x elements */
+#define DLOAD_X2_VECTOR() x0 = LD_DP(x);                 /* contiguous load of 2 x elements */
+
+#define DGEMV_T_MSA() /* y += alpha*A^T*x driver: per column block, accumulate vector dot products then reduce to scalars */ \
+    for (j = (n >> 3); j--;) \
+    { \
+        tp0 = zero; /* one accumulator per column of this 8-column block */ \
+        tp1 = zero; \
+        tp2 = zero; \
+        tp3 = zero; \
+        tp4 = zero; \
+        tp5 = zero; \
+        tp6 = zero; \
+        tp7 = zero; \
+ \
+        k = 0; \
+        x = srcx_org; /* restart at the top of x for each column block */ \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_X8(); \
+            DGEMV_T_8x8(); \
+ \
+            x += 8 * inc_x; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_X4(); \
+            DGEMV_T_8x4(); \
+ \
+            x += 4 * inc_x; \
+            k += 4; \
+        } \
+ \
+        if (m & 2) \
+        { \
+            DLOAD_X2(); \
+            DGEMV_T_8x2(); \
+ \
+            x += 2 * inc_x; \
+            k += 2; \
+        } \
+ \
+        ILVRL_D2_DP(tp1, tp0, t0, t4); /* interleave lane pairs so each sum below holds two columns' totals */ \
+        ILVRL_D2_DP(tp3, tp2, t1, t5); \
+        ILVRL_D2_DP(tp5, tp4, t2, t6); \
+        ILVRL_D2_DP(tp7, tp6, t3, t7); \
+        ADD2(t0, t4, t1, t5, t0, t1); \
+        ADD2(t2, t6, t3, t7, t2, t3); \
+ \
+        temp0 = t0[0]; \
+        temp1 = t0[1]; \
+        temp2 = t1[0]; \
+        temp3 = t1[1]; \
+        temp4 = t2[0]; \
+        temp5 = t2[1]; \
+        temp6 = t3[0]; \
+        temp7 = t3[1]; \
+ \
+        if (m & 1) /* last odd row handled in scalar code */ \
+        { \
+            temp0 += pa0[k] * x[0]; \
+            temp1 += pa1[k] * x[0]; \
+            temp2 += pa2[k] * x[0]; \
+            temp3 += pa3[k] * x[0]; \
+            temp4 += pa4[k] * x[0]; \
+            temp5 += pa5[k] * x[0]; \
+            temp6 += pa6[k] * x[0]; \
+            temp7 += pa7[k] * x[0]; \
+ \
+            x += inc_x; \
+            k++; \
+        } \
+ \
+        res0 = y[0 * inc_y]; \
+        res1 = y[1 * inc_y]; \
+        res2 = y[2 * inc_y]; \
+        res3 = y[3 * inc_y]; \
+        res4 = y[4 * inc_y]; \
+        res5 = y[5 * inc_y]; \
+        res6 = y[6 * inc_y]; \
+        res7 = y[7 * inc_y]; \
+ \
+        res0 += alpha * temp0; \
+        res1 += alpha * temp1; \
+        res2 += alpha * temp2; \
+        res3 += alpha * temp3; \
+        res4 += alpha * temp4; \
+        res5 += alpha * temp5; \
+        res6 += alpha * temp6; \
+        res7 += alpha * temp7; \
+ \
+        y[0 * inc_y] = res0; \
+        y[1 * inc_y] = res1; \
+        y[2 * inc_y] = res2; \
+        y[3 * inc_y] = res3; \
+        y[4 * inc_y] = res4; \
+        y[5 * inc_y] = res5; \
+        y[6 * inc_y] = res6; \
+        y[7 * inc_y] = res7; \
+ \
+        y += 8 * inc_y; \
+ \
+        pa0 += 8 * lda; \
+        pa1 += 8 * lda; \
+        pa2 += 8 * lda; \
+        pa3 += 8 * lda; \
+        pa4 += 8 * lda; \
+        pa5 += 8 * lda; \
+        pa6 += 8 * lda; \
+        pa7 += 8 * lda; \
+    } \
+ \
+    if (n & 4) /* 4 remaining columns */ \
+    { \
+        tp0 = zero; \
+        tp1 = zero; \
+        tp2 = zero; \
+        tp3 = zero; \
+ \
+        k = 0; \
+        x = srcx_org; \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_X8(); \
+            DGEMV_T_4x8(); \
+ \
+            x += 8 * inc_x; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_X4(); \
+            DGEMV_T_4x4(); \
+ \
+            x += 4 * inc_x; \
+            k += 4; \
+        } \
+ \
+        if (m & 2) \
+        { \
+            DLOAD_X2(); \
+            DGEMV_T_4x2(); \
+ \
+            x += 2 * inc_x; \
+            k += 2; \
+        } \
+ \
+        ILVRL_D2_DP(tp1, tp0, t0, t4); \
+        ILVRL_D2_DP(tp3, tp2, t1, t5); \
+        ADD2(t0, t4, t1, t5, t0, t1); \
+ \
+        temp0 = t0[0]; \
+        temp1 = t0[1]; \
+        temp2 = t1[0]; \
+        temp3 = t1[1]; \
+ \
+        if (m & 1) \
+        { \
+            temp0 += pa0[k] * x[0]; \
+            temp1 += pa1[k] * x[0]; \
+            temp2 += pa2[k] * x[0]; \
+            temp3 += pa3[k] * x[0]; \
+ \
+            x += inc_x; \
+            k++; \
+        } \
+ \
+        res0 = y[0 * inc_y]; \
+        res1 = y[1 * inc_y]; \
+        res2 = y[2 * inc_y]; \
+        res3 = y[3 * inc_y]; \
+ \
+        res0 += alpha * temp0; \
+        res1 += alpha * temp1; \
+        res2 += alpha * temp2; \
+        res3 += alpha * temp3; \
+ \
+        y[0 * inc_y] = res0; \
+        y[1 * inc_y] = res1; \
+        y[2 * inc_y] = res2; \
+        y[3 * inc_y] = res3; \
+ \
+        y += 4 * inc_y; \
+ \
+        pa0 += 4 * lda; \
+        pa1 += 4 * lda; \
+        pa2 += 4 * lda; \
+        pa3 += 4 * lda; \
+    } \
+ \
+    if (n & 2) /* 2 remaining columns */ \
+    { \
+        tp0 = zero; \
+        tp1 = zero; \
+ \
+        k = 0; \
+        x = srcx_org; \
+ \
+        for (i = (m >> 3); i--;) \
+        { \
+            DLOAD_X8(); \
+            DGEMV_T_2x8(); \
+ \
+            x += 8 * inc_x; \
+            k += 8; \
+        } \
+ \
+        if (m & 4) \
+        { \
+            DLOAD_X4(); \
+            DGEMV_T_2x4(); \
+ \
+            x += 4 * inc_x; \
+            k += 4; \
+        } \
+ \
+        if (m & 2) \
+        { \
+            DLOAD_X2(); \
+            DGEMV_T_2x2(); \
+ \
+            x += 2 * inc_x; \
+            k += 2; \
+        } \
+ \
+        ILVRL_D2_DP(tp1, tp0, t0, t4); \
+ \
+        t0 += t4; \
+ \
+        temp0 = t0[0]; \
+        temp1 = t0[1]; \
+ \
+        if (m & 1) \
+        { \
+            temp0 += pa0[k] * x[0]; \
+            temp1 += pa1[k] * x[0]; \
+            x += inc_x; \
+            k++; \
+        } \
+ \
+        res0 = y[0 * inc_y]; \
+        res1 = y[1 * inc_y]; \
+ \
+        res0 += alpha * temp0; \
+        res1 += alpha * temp1; \
+ \
+        y[0 * inc_y] = res0; \
+        y[1 * inc_y] = res1; \
+ \
+        y += 2 * inc_y; \
+ \
+        pa0 += 2 * lda; \
+        pa1 += 2 * lda; \
+    } \
+ \
+    if (n & 1) /* last column: plain scalar dot product */ \
+    { \
+        temp0 = 0.0; \
+ \
+        k = 0; \
+        x = srcx_org; \
+ \
+        for (i = m; i--;) \
+        { \
+            temp0 += pa0[k] * x[0]; \
+            x += inc_x; \
+            k++; \
+        } \
+ \
+        y[0] += alpha * temp0; \
+        y += inc_y; \
+        pa0 += lda; \
+    }
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, /* y := alpha*A^T*x + y; A is m x n, column-major, leading dimension lda */
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer) /* buffer: workspace from the interface layer, unused here */
+{
+    BLASLONG i, j, k; /* i: row loop, j: column-block loop, k: row offset into each column */
+    FLOAT *srcx_org = x; /* x base pointer; DGEMV_T_MSA rewinds to it per column block */
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; /* pointers to 8 consecutive columns of A */
+    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+    v2f64 x0, x1, x2, x3;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v2f64 zero = {0};
+
+    pa0 = A + 0 * lda;
+    pa1 = A + 1 * lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if (1 == inc_x) /* y is always accessed through inc_y scalars here; only x load style is selected */
+    {
+        #define DLOAD_X8  DLOAD_X8_VECTOR
+        #define DLOAD_X4  DLOAD_X4_VECTOR
+        #define DLOAD_X2  DLOAD_X2_VECTOR
+
+        DGEMV_T_MSA();
+
+        #undef DLOAD_X8
+        #undef DLOAD_X4
+        #undef DLOAD_X2
+    }
+    else
+    {
+        #define DLOAD_X8  DLOAD_X8_GP
+        #define DLOAD_X4  DLOAD_X4_GP
+        #define DLOAD_X2  DLOAD_X2_GP
+
+        DGEMV_T_MSA();
+
+        #undef DLOAD_X8
+        #undef DLOAD_X4
+        #undef DLOAD_X2
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c
new file mode 100644
index 000000000..de7f7167f
--- /dev/null
+++ b/kernel/mips/dot.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) /* dsdot variant: single-precision inputs, double-precision result */
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) /* plain dot product of x and y with strides inc_x/inc_y */
+#endif
+{
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+    double dot = 0.0 ; /* accumulate in double in both variants (extra accuracy for the FLOAT case) */
+
+    if ( n < 0 ) return(dot); /* n == 0 also yields 0.0: the loop below simply doesn't run */
+
+    while(i < n)
+    {
+
+        dot += y[iy] * x[ix] ;
+        ix  += inc_x ;
+        iy  += inc_y ;
+        i++ ;
+
+    }
+    return(dot);
+
+}
+
+
diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
new file mode 100644
index 000000000..dc21dab45
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
@@ -0,0 +1,1349 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+ v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+ v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+ v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+ v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+ v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+ v2f64 src_a61, src_a62, src_a63;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+ LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+ LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *pba = a, *pbb = b;
+ v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+
+ LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2(pbb, 2, src_b0, src_b1);
+
+ for (i = (bk - 1); i--;)
+ {
+ pba += 8;
+ pbb += 4;
+
+ LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
+ LD_DP2(pbb, 2, src_b2, src_b3);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ src_a0 = src_a8;
+ src_a1 = src_a9;
+ src_a2 = src_a16;
+ src_a3 = src_a17;
+ src_b0 = src_b2;
+ src_b1 = src_b3;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+ }
+
+ a -= 64;
+ b -= 32;
+
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+ ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
+ ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
+ ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
+ ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
+
+ src_a54 = __msa_cast_to_vector_double(*(a + 54));
+ src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+ src_a62 = LD_DP(a + 62);
+ src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
+ src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
+ src_a60 = LD_DP(a + 60);
+ src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
+ src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
+ src_a52 = LD_DP(a + 52);
+ src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
+ src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
+ src_a44 = LD_DP(a + 44);
+ src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
+ src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
+ src_a36 = __msa_cast_to_vector_double(*(a + 36));
+ src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+
+ res_c7 *= src_a63;
+ res_c6 -= res_c7 * src_a62;
+ res_c6 *= src_a54;
+
+ res_c15 *= src_a63;
+ res_c14 -= res_c15 * src_a62;
+ res_c14 *= src_a54;
+
+ ST_DP(res_c7, b + 28);
+ ST_DP(res_c6, b + 24);
+ ST_DP(res_c15, b + 30);
+ ST_DP(res_c14, b + 26);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+ ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
+ ST_DP(src_c3, c + 6);
+ ST_DP(src_c7, c_nxt1line + 6);
+ ST_DP(src_c11, c_nxt2line + 6);
+ ST_DP(src_c15, c_nxt3line + 6);
+
+ res_c5 -= res_c7 * src_a61;
+ res_c5 -= res_c6 * src_a53;
+ res_c5 *= src_a45;
+
+ res_c4 -= res_c7 * src_a60;
+ res_c4 -= res_c6 * src_a52;
+ res_c4 -= res_c5 * src_a44;
+ res_c4 *= src_a36;
+
+ res_c13 -= res_c15 * src_a61;
+ res_c13 -= res_c14 * src_a53;
+ res_c13 *= src_a45;
+
+ res_c12 -= res_c15 * src_a60;
+ res_c12 -= res_c14 * src_a52;
+ res_c12 -= res_c13 * src_a44;
+ res_c12 *= src_a36;
+
+ src_a56 = LD_DP(a + 56);
+ src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
+ src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
+ src_a58 = LD_DP(a + 58);
+ src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
+ src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
+
+ ST_DP(res_c4, b + 16);
+ ST_DP(res_c5, b + 20);
+ ST_DP(res_c12, b + 18);
+ ST_DP(res_c13, b + 22);
+
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
+ ST_DP(src_c2, c + 4);
+ ST_DP(src_c6, c_nxt1line + 4);
+ ST_DP(src_c10, c_nxt2line + 4);
+ ST_DP(src_c14, c_nxt3line + 4);
+
+ src_a50 = LD_DP(a + 50);
+ src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
+ src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
+ src_a42 = LD_DP(a + 42);
+ src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
+ src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
+ src_a34 = LD_DP(a + 34);
+ src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
+ src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
+ src_a26 = LD_DP(a + 26);
+ src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+ src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+ src_a18 = __msa_cast_to_vector_double(*(a + 18));
+ src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+
+ res_c3 -= res_c7 * src_a59;
+ res_c2 -= res_c7 * src_a58;
+ res_c1 -= res_c7 * src_a57;
+ res_c0 -= res_c7 * src_a56;
+
+ res_c11 -= res_c15 * src_a59;
+ res_c10 -= res_c15 * src_a58;
+ res_c9 -= res_c15 * src_a57;
+ res_c8 -= res_c15 * src_a56;
+
+ res_c3 -= res_c6 * src_a51;
+ res_c3 -= res_c5 * src_a43;
+ res_c3 -= res_c4 * src_a35;
+ res_c3 *= src_a27;
+
+ res_c2 -= res_c6 * src_a50;
+ res_c2 -= res_c5 * src_a42;
+ res_c2 -= res_c4 * src_a34;
+ res_c2 -= res_c3 * src_a26;
+ res_c2 *= src_a18;
+
+ res_c11 -= res_c14 * src_a51;
+ res_c11 -= res_c13 * src_a43;
+ res_c11 -= res_c12 * src_a35;
+ res_c11 *= src_a27;
+
+ res_c10 -= res_c14 * src_a50;
+ res_c10 -= res_c13 * src_a42;
+ res_c10 -= res_c12 * src_a34;
+ res_c10 -= res_c11 * src_a26;
+ res_c10 *= src_a18;
+
+ src_a48 = LD_DP(a + 48);
+ src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
+ src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
+ src_a40 = LD_DP(a + 40);
+ src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
+ src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
+
+ ST_DP(res_c2, b + 8);
+ ST_DP(res_c3, b + 12);
+ ST_DP(res_c10, b + 10);
+ ST_DP(res_c11, b + 14);
+
+ src_a32 = LD_DP(a + 32);
+ src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
+ src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
+ src_a24 = LD_DP(a + 24);
+ src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+ src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
+ ST_DP(src_c1, c + 2);
+ ST_DP(src_c5, c_nxt1line + 2);
+ ST_DP(src_c9, c_nxt2line + 2);
+ ST_DP(src_c13, c_nxt3line + 2);
+
+ res_c1 -= res_c6 * src_a49;
+ res_c1 -= res_c5 * src_a41;
+ res_c1 -= res_c4 * src_a33;
+ res_c1 -= res_c3 * src_a25;
+
+ res_c0 -= res_c6 * src_a48;
+ res_c0 -= res_c5 * src_a40;
+ res_c0 -= res_c4 * src_a32;
+ res_c0 -= res_c3 * src_a24;
+
+ res_c9 -= res_c14 * src_a49;
+ res_c9 -= res_c13 * src_a41;
+ res_c9 -= res_c12 * src_a33;
+ res_c9 -= res_c11 * src_a25;
+
+ res_c8 -= res_c14 * src_a48;
+ res_c8 -= res_c13 * src_a40;
+ res_c8 -= res_c12 * src_a32;
+ res_c8 -= res_c11 * src_a24;
+
+ src_a16 = LD_DP(a + 16);
+ src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+ src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+ src_a8 = LD_DP(a + 8);
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+ src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+ src_a0 = __msa_cast_to_vector_double(*(a + 0));
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+ res_c1 -= res_c2 * src_a17;
+ res_c1 *= src_a9;
+
+ res_c9 -= res_c10 * src_a17;
+ res_c9 *= src_a9;
+
+ res_c0 -= res_c2 * src_a16;
+ res_c0 -= res_c1 * src_a8;
+ res_c0 *= src_a0;
+
+ res_c8 -= res_c10 * src_a16;
+ res_c8 -= res_c9 * src_a8;
+ res_c8 *= src_a0;
+
+ ST_DP(res_c0, b + 0);
+ ST_DP(res_c8, b + 2);
+ ST_DP(res_c1, b + 4);
+ ST_DP(res_c9, b + 6);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
+
+ ST_DP(src_c0, c);
+ ST_DP(src_c4, c_nxt1line);
+ ST_DP(src_c8, c_nxt2line);
+ ST_DP(src_c12, c_nxt3line);
+}
+
+static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+ v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+ v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+ v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+ v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+ v2f64 src_a61, src_a62, src_a63;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *pba = a, *pbb = b;
+ v2f64 src_b, src_b0, src_b1;
+
+ LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pbb);
+
+ for (i = bk - 1; i--;)
+ {
+ pba += 8;
+ pbb += 2;
+
+ LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
+ src_b1 = LD_DP(pbb);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_a0 = src_a8;
+ src_a1 = src_a9;
+ src_a2 = src_a16;
+ src_a3 = src_a17;
+ src_b0 = src_b1;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+ }
+
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+
+ src_a56 = LD_DP(a - 8);
+ src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
+ src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
+ src_a58 = LD_DP(a - 6);
+ src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
+ src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
+ src_a60 = LD_DP(a - 4);
+ src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
+ src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
+ src_a62 = LD_DP(a - 2);
+ src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
+ src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
+
+ res_c7 *= src_a63;
+ res_c6 -= res_c7 * src_a62;
+ res_c5 -= res_c7 * src_a61;
+ res_c4 -= res_c7 * src_a60;
+ res_c3 -= res_c7 * src_a59;
+ res_c2 -= res_c7 * src_a58;
+ res_c1 -= res_c7 * src_a57;
+ res_c0 -= res_c7 * src_a56;
+
+ src_a48 = LD_DP(a - 16);
+ src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
+ src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
+ src_a50 = LD_DP(a - 14);
+ src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
+ src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
+ src_a52 = LD_DP(a - 12);
+ src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
+ src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
+ src_a54 = __msa_cast_to_vector_double(*(a - 10));
+ src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+ src_a40 = LD_DP(a - 24);
+ src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
+ src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
+ src_a42 = LD_DP(a - 22);
+ src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
+ src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
+ src_a44 = LD_DP(a - 20);
+ src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
+ src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
+
+ res_c6 *= src_a54;
+ res_c5 -= res_c6 * src_a53;
+ res_c4 -= res_c6 * src_a52;
+ res_c3 -= res_c6 * src_a51;
+ res_c2 -= res_c6 * src_a50;
+ res_c1 -= res_c6 * src_a49;
+ res_c0 -= res_c6 * src_a48;
+
+ res_c5 *= src_a45;
+ res_c4 -= res_c5 * src_a44;
+ res_c3 -= res_c5 * src_a43;
+ res_c2 -= res_c5 * src_a42;
+ res_c1 -= res_c5 * src_a41;
+ res_c0 -= res_c5 * src_a40;
+
+ ST_DP(res_c7, b - 2);
+ ST_DP(res_c6, b - 4);
+ ST_DP(res_c5, b - 6);
+
+ src_a32 = LD_DP(a - 32);
+ src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
+ src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
+ src_a34 = LD_DP(a - 30);
+ src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
+ src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
+ src_a36 = __msa_cast_to_vector_double(*(a - 28));
+ src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+
+ res_c4 *= src_a36;
+ res_c3 -= res_c4 * src_a35;
+ res_c2 -= res_c4 * src_a34;
+ res_c1 -= res_c4 * src_a33;
+ res_c0 -= res_c4 * src_a32;
+
+ src_a24 = LD_DP(a - 40);
+ src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+ src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+ src_a26 = LD_DP(a - 38);
+ src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+ src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+ src_a16 = LD_DP(a - 48);
+ src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+ src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+ src_a18 = __msa_cast_to_vector_double(*(a - 46));
+ src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+ src_a0 = __msa_cast_to_vector_double(*(a - 64));
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+ src_a8 = LD_DP(a - 56);
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+ src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+
+ res_c3 *= src_a27;
+ res_c2 -= res_c3 * src_a26;
+ res_c1 -= res_c3 * src_a25;
+ res_c0 -= res_c3 * src_a24;
+
+ res_c2 *= src_a18;
+ res_c1 -= res_c2 * src_a17;
+ res_c0 -= res_c2 * src_a16;
+
+ res_c1 *= src_a9;
+ res_c0 -= res_c1 * src_a8;
+
+ res_c0 *= src_a0;
+
+ ST_DP(res_c4, b - 8);
+ ST_DP(res_c3, b - 10);
+ ST_DP(res_c2, b - 12);
+ ST_DP(res_c1, b - 14);
+ ST_DP(res_c0, b - 16);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+}
+
+static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
+ FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
+ FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
+ FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--; )
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+ c4 -= aa[4] * bb[0];
+ c5 -= aa[5] * bb[0];
+ c6 -= aa[6] * bb[0];
+ c7 -= aa[7] * bb[0];
+
+ aa += 8;
+ bb += 1;
+ }
+ }
+
+ a -= 64;
+ b -= 8;
+
+ a0 = *(a + 0);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a16 = *(a + 16);
+ a17 = *(a + 17);
+ a18 = *(a + 18);
+ a24 = *(a + 24);
+ a25 = *(a + 25);
+ a26 = *(a + 26);
+ a27 = *(a + 27);
+ a32 = *(a + 32);
+ a33 = *(a + 33);
+ a34 = *(a + 34);
+ a35 = *(a + 35);
+ a36 = *(a + 36);
+ a40 = *(a + 40);
+ a41 = *(a + 41);
+ a42 = *(a + 42);
+ a43 = *(a + 43);
+ a44 = *(a + 44);
+ a45 = *(a + 45);
+ a48 = *(a + 48);
+ a49 = *(a + 49);
+ a50 = *(a + 50);
+ a51 = *(a + 51);
+ a52 = *(a + 52);
+ a53 = *(a + 53);
+ a54 = *(a + 54);
+ a56 = *(a + 56);
+ a57 = *(a + 57);
+ a58 = *(a + 58);
+ a59 = *(a + 59);
+ a60 = *(a + 60);
+ a61 = *(a + 61);
+ a62 = *(a + 62);
+ a63 = *(a + 63);
+
+ c7 *= a63;
+
+ c6 -= c7 * a62;
+ c6 *= a54;
+
+ c5 -= c7 * a61;
+ c5 -= c6 * a53;
+ c5 *= a45;
+
+ c4 -= c7 * a60;
+ c4 -= c6 * a52;
+ c4 -= c5 * a44;
+ c4 *= a36;
+
+ c3 -= c7 * a59;
+ c3 -= c6 * a51;
+ c3 -= c5 * a43;
+ c3 -= c4 * a35;
+ c3 *= a27;
+
+ c2 -= c7 * a58;
+ c2 -= c6 * a50;
+ c2 -= c5 * a42;
+ c2 -= c4 * a34;
+ c2 -= c3 * a26;
+ c2 *= a18;
+
+ c1 -= c7 * a57;
+ c1 -= c6 * a49;
+ c1 -= c5 * a41;
+ c1 -= c4 * a33;
+ c1 -= c3 * a25;
+ c1 -= c2 * a17;
+ c1 *= a9;
+
+ c0 -= c7 * a56;
+ c0 -= c6 * a48;
+ c0 -= c5 * a40;
+ c0 -= c4 * a32;
+ c0 -= c3 * a24;
+ c0 -= c2 * a16;
+ c0 -= c1 * a8;
+ c0 *= a0;
+
+ *(b + 7) = c7;
+ *(b + 6) = c6;
+ *(b + 5) = c5;
+ *(b + 4) = c4;
+ *(b + 3) = c3;
+ *(b + 2) = c2;
+ *(b + 1) = c1;
+ *(b + 0) = c0;
+
+ *(c + 7) = c7;
+ *(c + 6) = c6;
+ *(c + 5) = c5;
+ *(c + 4) = c4;
+ *(c + 3) = c3;
+ *(c + 2) = c2;
+ *(c + 1) = c1;
+ *(c + 0) = c0;
+}
+
+static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
+ v2f64 src_a14, src_a15;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+ LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+ LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(aa, 2, src_a0, src_a1);
+ LD_DP2(bb, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c6 -= src_a0 * src_b;
+ src_c7 -= src_a1 * src_b;
+
+ aa += 4;
+ bb += 4;
+ }
+ }
+
+ a -= 16;
+ b -= 16;
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
+
+ src_a14 = LD_DP(a + 14);
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+ src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+ src_a12 = LD_DP(a + 12);
+ src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+ src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+
+ src_a9 = LD_DP(a + 9);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+
+ src_a8 = __msa_cast_to_vector_double(*(a + 8));
+ src_a0 = __msa_cast_to_vector_double(*(a + 0));
+
+ src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+ src_a4 = LD_DP(a + 4);
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+ src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+
+ res_c3 *= src_a15;
+ res_c7 *= src_a15;
+
+ res_c2 -= res_c3 * src_a14;
+ res_c6 -= res_c7 * src_a14;
+ res_c2 *= src_a10;
+ res_c6 *= src_a10;
+
+ res_c1 -= res_c3 * src_a13;
+ res_c5 -= res_c7 * src_a13;
+ res_c1 -= res_c2 * src_a9;
+ res_c5 -= res_c6 * src_a9;
+ res_c1 *= src_a5;
+ res_c5 *= src_a5;
+
+ res_c0 -= res_c3 * src_a12;
+ res_c4 -= res_c7 * src_a12;
+ res_c0 -= res_c2 * src_a8;
+ res_c4 -= res_c6 * src_a8;
+ res_c0 -= res_c1 * src_a4;
+ res_c4 -= res_c5 * src_a4;
+ res_c0 *= src_a0;
+ res_c4 *= src_a0;
+
+ ST_DP(res_c7, b + 14);
+ ST_DP(res_c3, b + 12);
+ ST_DP(res_c6, b + 10);
+ ST_DP(res_c2, b + 8);
+ ST_DP(res_c5, b + 6);
+ ST_DP(res_c1, b + 4);
+ ST_DP(res_c4, b + 2);
+ ST_DP(res_c0, b + 0);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+ ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+ ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+ ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+}
+
+static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
+ v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
+ v2f64 src_a14, src_a15;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_b, src_b0;
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(aa, 2, src_a0, src_a1);
+ src_b0 = LD_DP(bb);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ aa += 4;
+ bb += 2;
+ }
+ }
+
+ a -= 16;
+ b -= 8;
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+
+ src_a14 = LD_DP(a + 14);
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+ src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+ src_a12 = LD_DP(a + 12);
+ src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+ src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+
+ src_a9 = LD_DP(a + 9);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+
+ src_a8 = __msa_cast_to_vector_double(*(a + 8));
+ src_a0 = __msa_cast_to_vector_double(*(a + 0));
+
+ src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+ src_a4 = LD_DP(a + 4);
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+ src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+
+ res_c3 *= src_a15;
+
+ res_c2 -= res_c3 * src_a14;
+ res_c2 *= src_a10;
+
+ res_c1 -= res_c3 * src_a13;
+ res_c1 -= res_c2 * src_a9;
+ res_c1 *= src_a5;
+
+ res_c0 -= res_c3 * src_a12;
+ res_c0 -= res_c2 * src_a8;
+ res_c0 -= res_c1 * src_a4;
+ res_c0 *= src_a0;
+
+ ST_DP(res_c3, b + 6);
+ ST_DP(res_c2, b + 4);
+ ST_DP(res_c1, b + 2);
+ ST_DP(res_c0, b + 0);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+}
+
+static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+
+ aa += 4;
+ bb += 1;
+ }
+ }
+
+ a -= 16;
+ b -= 4;
+
+ a0 = *(a + 0);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+
+ c3 *= a15;
+
+ c2 -= c3 * a14;
+ c2 *= a10;
+
+ c1 -= c3 * a13;
+ c1 -= c2 * a9;
+ c1 *= a5;
+
+ c0 -= c3 * a12;
+ c0 -= c2 * a8;
+ c0 -= c1 * a4;
+ c0 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+}
+
+static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
+ FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt1 = *(c + 0 + ldc);
+ c1_nxt1 = *(c + 1 + ldc);
+ c0_nxt2 = *(c + 0 + 2 * ldc);
+ c1_nxt2 = *(c + 1 + 2 * ldc);
+ c0_nxt3 = *(c + 0 + 3 * ldc);
+ c1_nxt3 = *(c + 1 + 3 * ldc);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt1 -= aa[0] * bb[1];
+ c1_nxt1 -= aa[1] * bb[1];
+ c0_nxt2 -= aa[0] * bb[2];
+ c1_nxt2 -= aa[1] * bb[2];
+ c0_nxt3 -= aa[0] * bb[3];
+ c1_nxt3 -= aa[1] * bb[3];
+
+ aa += 2;
+ bb += 4;
+ }
+ }
+
+ a -= 4;
+ b -= 8;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+ c0 -= c1 * a2;
+ c0 *= a0;
+
+ c1_nxt1 *= a3;
+ c0_nxt1 -= c1_nxt1 * a2;
+ c0_nxt1 *= a0;
+
+ c1_nxt2 *= a3;
+ c0_nxt2 -= c1_nxt2 * a2;
+ c0_nxt2 *= a0;
+
+ c1_nxt3 *= a3;
+ c0_nxt3 -= c1_nxt3 * a2;
+ c0_nxt3 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt1;
+ *(b + 2) = c0_nxt2;
+ *(b + 3) = c0_nxt3;
+ *(b + 4) = c1;
+ *(b + 5) = c1_nxt1;
+ *(b + 6) = c1_nxt2;
+ *(b + 7) = c1_nxt3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + ldc) = c0_nxt1;
+ *(c + 1 + ldc) = c1_nxt1;
+ *(c + 0 + 2 * ldc) = c0_nxt2;
+ *(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 0 + 3 * ldc) = c0_nxt3;
+ *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
+
+ aa += 2;
+ bb += 2;
+ }
+ }
+
+ a -= 4;
+ b -= 4;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+
+ c0 -= c1 * a2;
+ c0 *= a0;
+
+ c1_nxt *= a3;
+
+ c0_nxt -= c1_nxt * a2;
+ c0_nxt *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+}
+
+static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ FLOAT a0, a2, a3, c0, c1;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+
+ aa += 2;
+ bb += 1;
+ }
+ }
+
+ a0 = *(a - 4);
+ a2 = *(a - 2);
+ a3 = *(a - 1);
+
+ c1 *= a3;
+ c0 -= c1 * a2;
+ c0 *= a0;
+
+ *(b - 2) = c0;
+ *(b - 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+}
+
+static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ FLOAT c0, c1, c2, c3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[0] * bb[1];
+ c2 -= aa[0] * bb[2];
+ c3 -= aa[0] * bb[3];
+
+ aa += 1;
+ bb += 4;
+ }
+ }
+
+ c0 *= *(a - 1);
+ c1 *= *(a - 1);
+ c2 *= *(a - 1);
+ c3 *= *(a - 1);
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+
+ *(b - 4) = c0;
+ *(b - 3) = c1;
+ *(b - 2) = c2;
+ *(b - 1) = c3;
+}
+
+static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
+{
+ *c *= *a;
+ *(c + ldc) = *a * *(c + ldc);
+
+ *b = *c;
+ *(b + 1) = *(c + ldc);
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+ BLASLONG kk, i, j;
+ FLOAT *aa, *bb, *cc;
+
+ for (j = (n >> 2); j--;)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + (m - 1) * k + kk;
+ bb = b + 4 * kk;
+ cc = c + (m - 1);
+
+ dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & -2) - 2) * k + 2 * kk;
+ bb = b + 4 * kk;
+ cc = c + ((m & -2) - 2);
+
+ dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & -4) - 4) * k + 4 * kk;
+ bb = b + 4 * kk;
+ cc = c + ((m & -4) - 4);
+
+ dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & -8) - 8) * k;
+ cc = c + ((m & -8) - 8);
+
+ do
+ {
+ dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += 4 * k;
+ c += 4 * ldc;
+ }
+
+ if (n & 3)
+ {
+ if (n & 2)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + ((m & -1) - 1) * k;
+ cc = c + ((m & -1) - 1);
+
+ dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc);
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & -2) - 2) * k;
+ cc = c + ((m & -2) - 2);
+
+ dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & -4) - 4) * k;
+ cc = c + ((m & -4) - 4);
+
+ dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & -8) - 8) * k;
+ cc = c + ((m & -8) - 8);
+
+ do
+ {
+ dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += 2 * k;
+ c += 2 * ldc;
+ }
+
+ if (n & 1)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ kk -= 1;
+ aa = a + ((m & -1) - 1) * k + kk;
+ cc = c + ((m & -1) - 1);
+
+ *cc *= *aa;
+ *(b + kk) = *cc;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & -2) - 2) * k + kk * 2;
+ cc = c + ((m & -2) - 2);
+
+ dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & -4) - 4) * k;
+ cc = c + ((m & -4) - 4);
+
+ dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & -8) - 8) * k;
+ cc = c + ((m & -8) - 8);
+
+ do
+ {
+ dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
new file mode 100644
index 000000000..897fd313b
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
@@ -0,0 +1,1334 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* solve 8x4 tile: GEMM-update C by bk panels, then forward-substitute; a-indices 0..63 trace the upper triangle of a row-major 8x8 (LT packing) */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+ v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+ v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+ v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
+ FLOAT *c_nxt1line = c + ldc;   /* columns 1..3 of the 4-wide C tile */
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);   /* load 8 doubles (4 vectors) per C column */
+ LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+ LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+ LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+ if (bk)   /* rank-bk update: C -= A(8 x bk) * B(bk x 4), software-pipelined with the last iteration peeled */
+ {
+ BLASLONG i;
+ v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2(b, 2, src_b0, src_b1);
+
+ for (i = (bk - 1); i--;)
+ {
+ a += 8;
+ b += 4;
+
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);   /* prefetch next A column / B row */
+ LD_DP2(b, 2, src_b2, src_b3);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[0] to both lanes */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[1] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);   /* broadcast b[2] */
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);   /* broadcast b[3] */
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ src_a0 = src_a4;   /* rotate pipeline registers */
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b2;
+ src_b1 = src_b3;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* peeled final iteration (no load beyond the panel) */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ a += 8;   /* a now points at the packed triangular 8x8 solve block */
+ b += 4;
+ }
+
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);   /* transpose pairs: res_cN holds row N of cols {0,1}; res_c8.. for cols {2,3} */
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+ ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
+ ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
+ ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
+ ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
+
+ src_a0 = LD_DP(a + 0);   /* row 0 of the triangle; diagonal a[0] is multiplied below, so presumably stored pre-inverted — TODO confirm against packing */
+ src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+ src_a2 = LD_DP(a + 2);
+ src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+ src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+ src_a4 = LD_DP(a + 4);
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+ src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+ src_a6 = LD_DP(a + 6);
+ src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+ src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+ res_c0 *= src_a0;   /* finalize row 0, then eliminate it from rows 1..7 (cols 0,1) */
+ res_c1 -= res_c0 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+ res_c4 -= res_c0 * src_a4;
+ res_c5 -= res_c0 * src_a5;
+ res_c6 -= res_c0 * src_a6;
+ res_c7 -= res_c0 * src_a7;
+
+ res_c8 *= src_a0;   /* same elimination for cols 2,3 */
+ res_c9 -= res_c8 * src_a1;
+ res_c10 -= res_c8 * src_a2;
+ res_c11 -= res_c8 * src_a3;
+ res_c12 -= res_c8 * src_a4;
+ res_c13 -= res_c8 * src_a5;
+ res_c14 -= res_c8 * src_a6;
+ res_c15 -= res_c8 * src_a7;
+
+ src_a9 = __msa_cast_to_vector_double(*(a + 9));   /* row 1 starts at a[9] (a[8] is below the diagonal, unused) */
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+ src_a10 = LD_DP(a + 10);
+ src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+ src_a12 = LD_DP(a + 12);
+ src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+ src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+ src_a14 = LD_DP(a + 14);
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+ src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+ res_c1 *= src_a9;   /* row 1 */
+ res_c2 -= res_c1 * src_a10;
+ res_c3 -= res_c1 * src_a11;
+ res_c4 -= res_c1 * src_a12;
+ res_c5 -= res_c1 * src_a13;
+ res_c6 -= res_c1 * src_a14;
+ res_c7 -= res_c1 * src_a15;
+
+ res_c9 *= src_a9;
+ res_c10 -= res_c9 * src_a10;
+ res_c11 -= res_c9 * src_a11;
+ res_c12 -= res_c9 * src_a12;
+ res_c13 -= res_c9 * src_a13;
+ res_c14 -= res_c9 * src_a14;
+ res_c15 -= res_c9 * src_a15;
+
+ ST_DP(res_c0, b + 0);   /* write solved rows 0,1 back into packed B */
+ ST_DP(res_c8, b + 2);
+ ST_DP(res_c1, b + 4);
+ ST_DP(res_c9, b + 6);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);   /* transpose back to column layout for C */
+ ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
+
+ ST_DP(src_c0, c);
+ ST_DP(src_c4, c_nxt1line);
+ ST_DP(src_c8, c_nxt2line);
+ ST_DP(src_c12, c_nxt3line);
+
+ src_a18 = LD_DP(a + 18);   /* row 2 starts at a[18] */
+ src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
+ src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+ src_a20 = LD_DP(a + 20);
+ src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
+ src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
+ src_a22 = LD_DP(a + 22);
+ src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
+ src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
+
+ res_c2 *= src_a18;
+ res_c3 -= res_c2 * src_a19;
+ res_c4 -= res_c2 * src_a20;
+ res_c5 -= res_c2 * src_a21;
+ res_c6 -= res_c2 * src_a22;
+ res_c7 -= res_c2 * src_a23;
+
+ res_c10 *= src_a18;
+ res_c11 -= res_c10 * src_a19;
+ res_c12 -= res_c10 * src_a20;
+ res_c13 -= res_c10 * src_a21;
+ res_c14 -= res_c10 * src_a22;
+ res_c15 -= res_c10 * src_a23;
+
+ src_a27 = __msa_cast_to_vector_double(*(a + 27));   /* row 3 starts at a[27] */
+ src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
+ src_a28 = LD_DP(a + 28);
+ src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
+ src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
+ src_a30 = LD_DP(a + 30);
+ src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
+ src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
+
+ res_c3 *= src_a27;
+ res_c4 -= res_c3 * src_a28;
+ res_c5 -= res_c3 * src_a29;
+ res_c6 -= res_c3 * src_a30;
+ res_c7 -= res_c3 * src_a31;
+
+ res_c11 *= src_a27;
+ res_c12 -= res_c11 * src_a28;
+ res_c13 -= res_c11 * src_a29;
+ res_c14 -= res_c11 * src_a30;
+ res_c15 -= res_c11 * src_a31;
+
+ ST_DP(res_c2, b + 8);   /* rows 2,3 to packed B */
+ ST_DP(res_c10, b + 10);
+ ST_DP(res_c3, b + 12);
+ ST_DP(res_c11, b + 14);
+
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
+
+ src_a36 = LD_DP(a + 36);   /* row 4 starts at a[36] */
+ src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
+ src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+ src_a38 = LD_DP(a + 38);
+ src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
+ src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
+
+ res_c4 *= src_a36;
+ res_c5 -= res_c4 * src_a37;
+ res_c6 -= res_c4 * src_a38;
+ res_c7 -= res_c4 * src_a39;
+
+ res_c12 *= src_a36;
+ res_c13 -= res_c12 * src_a37;
+ res_c14 -= res_c12 * src_a38;
+ res_c15 -= res_c12 * src_a39;
+
+ src_a45 = __msa_cast_to_vector_double(*(a + 45));   /* row 5 starts at a[45] */
+ src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
+ src_a46 = LD_DP(a + 46);
+ src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
+ src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
+
+ res_c5 *= src_a45;
+ res_c6 -= res_c5 * src_a46;
+ res_c7 -= res_c5 * src_a47;
+
+ res_c13 *= src_a45;
+ res_c14 -= res_c13 * src_a46;
+ res_c15 -= res_c13 * src_a47;
+
+ ST_DP(src_c1, c + 2);   /* rows 2,3 back to C (all four columns) */
+ ST_DP(src_c5, c_nxt1line + 2);
+ ST_DP(src_c9, c_nxt2line + 2);
+ ST_DP(src_c13, c_nxt3line + 2);
+
+ ST_DP(res_c4, b + 16);   /* rows 4,5 to packed B */
+ ST_DP(res_c12, b + 18);
+ ST_DP(res_c5, b + 20);
+ ST_DP(res_c13, b + 22);
+
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
+
+ src_a63 = __msa_cast_to_vector_double(*(a + 63));   /* row 7 diagonal; row 6 at a[54],a[55] */
+ src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
+ src_a54 = LD_DP(a + 54);
+ src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
+ src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+ res_c6 *= src_a54;
+ res_c7 -= res_c6 * src_a55;
+
+ res_c14 *= src_a54;
+ res_c15 -= res_c14 * src_a55;
+
+ res_c7 *= src_a63;
+ res_c15 *= src_a63;
+
+ ST_DP(src_c2, c + 4);   /* rows 4,5 back to C */
+ ST_DP(src_c6, c_nxt1line + 4);
+ ST_DP(src_c10, c_nxt2line + 4);
+ ST_DP(src_c14, c_nxt3line + 4);
+
+ ST_DP(res_c6, b + 24);   /* rows 6,7 to packed B */
+ ST_DP(res_c14, b + 26);
+ ST_DP(res_c7, b + 28);
+ ST_DP(res_c15, b + 30);
+
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+ ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
+
+ ST_DP(src_c3, c + 6);   /* rows 6,7 back to C */
+ ST_DP(src_c7, c_nxt1line + 6);
+ ST_DP(src_c11, c_nxt2line + 6);
+ ST_DP(src_c15, c_nxt3line + 6);
+}
+
+static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* 8x2 variant of dsolve_8x4_lt_msa: same triangular layout, two C columns */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+ v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+ v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+ v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);   /* 8 doubles from each of the 2 C columns */
+ LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+ if (bk)   /* rank-bk update C -= A(8 x bk) * B(bk x 2); last iteration peeled */
+ {
+ BLASLONG i;
+ v2f64 src_b, src_b0, src_b1;
+
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(b);
+
+ a += 8;
+ b += 2;
+
+ for (i = (bk - 1); i--;)
+ {
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);   /* prefetch next panel */
+ src_b1 = LD_DP(b);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[1] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_a0 = src_a4;   /* rotate pipeline registers */
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
+ a += 8;
+ b += 2;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* peeled final iteration */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+ }
+
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);   /* transpose: res_cN = row N across both columns */
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+
+ src_a0 = LD_DP(a + 0);   /* triangle row 0; diagonal presumably pre-inverted (multiplied, not divided) — TODO confirm */
+ src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+ src_a2 = LD_DP(a + 2);
+ src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+ src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+ src_a4 = LD_DP(a + 4);
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+ src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+ src_a6 = LD_DP(a + 6);
+ src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+ src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+ res_c0 *= src_a0;   /* forward substitution, row 0 */
+ res_c1 -= res_c0 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+ res_c4 -= res_c0 * src_a4;
+ res_c5 -= res_c0 * src_a5;
+ res_c6 -= res_c0 * src_a6;
+ res_c7 -= res_c0 * src_a7;
+
+ src_a9 = __msa_cast_to_vector_double(*(a + 9));   /* row 1 starts at a[9] */
+ src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+ src_a10 = LD_DP(a + 10);
+ src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+ src_a12 = LD_DP(a + 12);
+ src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+ src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+ src_a14 = LD_DP(a + 14);
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+ src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+ res_c1 *= src_a9;
+ res_c2 -= res_c1 * src_a10;
+ res_c3 -= res_c1 * src_a11;
+ res_c4 -= res_c1 * src_a12;
+ res_c5 -= res_c1 * src_a13;
+ res_c6 -= res_c1 * src_a14;
+ res_c7 -= res_c1 * src_a15;
+
+ src_a18 = LD_DP(a + 18);   /* row 2 */
+ src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
+ src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+ src_a20 = LD_DP(a + 20);
+ src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
+ src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
+ src_a22 = LD_DP(a + 22);
+ src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
+ src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
+
+ res_c2 *= src_a18;
+ res_c3 -= res_c2 * src_a19;
+ res_c4 -= res_c2 * src_a20;
+ res_c5 -= res_c2 * src_a21;
+ res_c6 -= res_c2 * src_a22;
+ res_c7 -= res_c2 * src_a23;
+
+ src_a27 = __msa_cast_to_vector_double(*(a + 27));   /* row 3 */
+ src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
+ src_a28 = LD_DP(a + 28);
+ src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
+ src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
+ src_a30 = LD_DP(a + 30);
+ src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
+ src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
+
+ res_c3 *= src_a27;
+ res_c4 -= res_c3 * src_a28;
+ res_c5 -= res_c3 * src_a29;
+ res_c6 -= res_c3 * src_a30;
+ res_c7 -= res_c3 * src_a31;
+
+ ST_DP(res_c0, b + 0);   /* solved rows 0..3 to packed B */
+ ST_DP(res_c1, b + 2);
+ ST_DP(res_c2, b + 4);
+ ST_DP(res_c3, b + 6);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);   /* back to column layout */
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c4, src_c5, c + ldc, 2);
+
+ src_a36 = LD_DP(a + 36);   /* row 4 */
+ src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
+ src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+ src_a38 = LD_DP(a + 38);
+ src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
+ src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
+
+ res_c4 *= src_a36;
+ res_c5 -= res_c4 * src_a37;
+ res_c6 -= res_c4 * src_a38;
+ res_c7 -= res_c4 * src_a39;
+
+ src_a45 = __msa_cast_to_vector_double(*(a + 45));   /* row 5 */
+ src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
+ src_a46 = LD_DP(a + 46);
+ src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
+ src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
+
+ res_c5 *= src_a45;
+ res_c6 -= res_c5 * src_a46;
+ res_c7 -= res_c5 * src_a47;
+
+ src_a63 = __msa_cast_to_vector_double(*(a + 63));   /* row 7 diagonal */
+ src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
+ src_a54 = LD_DP(a + 54);   /* row 6 */
+ src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
+ src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+ res_c6 *= src_a54;
+ res_c7 -= res_c6 * src_a55;
+
+ res_c7 *= src_a63;
+
+ ST_DP(res_c4, b + 8);   /* rows 4..7 to packed B */
+ ST_DP(res_c5, b + 10);
+ ST_DP(res_c6, b + 12);
+ ST_DP(res_c7, b + 14);
+
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+
+ ST_DP2(src_c2, src_c3, c + 4, 2);
+ ST_DP2(src_c6, src_c7, c + 4 + ldc, 2);
+}
+
+static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)   /* scalar 8x1 solve: GEMM-update one C column by bk, then forward-substitute with the packed 8x8 upper triangle */
+{
+ FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
+ FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
+ FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--; )   /* c -= A(8 x bk) * b(bk) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+ c4 -= a[4] * b[0];
+ c5 -= a[5] * b[0];
+ c6 -= a[6] * b[0];
+ c7 -= a[7] * b[0];
+
+ a += 8;
+ b += 1;
+ }
+ }
+
+ a0 = *(a + 0);   /* upper triangle of row-major 8x8; sub-diagonal slots (8,16,17,...) are skipped */
+ a1 = *(a + 1);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a6 = *(a + 6);
+ a7 = *(a + 7);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a11 = *(a + 11);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+ a18 = *(a + 18);
+ a19 = *(a + 19);
+ a20 = *(a + 20);
+ a21 = *(a + 21);
+ a22 = *(a + 22);
+ a23 = *(a + 23);
+ a27 = *(a + 27);
+ a28 = *(a + 28);
+ a29 = *(a + 29);
+ a30 = *(a + 30);
+ a31 = *(a + 31);
+ a36 = *(a + 36);
+ a37 = *(a + 37);
+ a38 = *(a + 38);
+ a39 = *(a + 39);
+ a45 = *(a + 45);
+ a46 = *(a + 46);
+ a47 = *(a + 47);
+ a54 = *(a + 54);
+ a55 = *(a + 55);
+ a63 = *(a + 63);
+
+ c0 *= a0;   /* diagonal multiply — diagonal presumably stored pre-inverted; TODO confirm against packing */
+
+ c1 -= c0 * a1;
+ c1 *= a9;
+
+ c2 -= c0 * a2;
+ c2 -= c1 * a10;
+ c2 *= a18;
+
+ c3 -= c0 * a3;
+ c3 -= c1 * a11;
+ c3 -= c2 * a19;
+ c3 *= a27;
+
+ c4 -= c0 * a4;
+ c4 -= c1 * a12;
+ c4 -= c2 * a20;
+ c4 -= c3 * a28;
+ c4 *= a36;
+
+ c5 -= c0 * a5;
+ c5 -= c1 * a13;
+ c5 -= c2 * a21;
+ c5 -= c3 * a29;
+ c5 -= c4 * a37;
+ c5 *= a45;
+
+ c6 -= c0 * a6;
+ c6 -= c1 * a14;
+ c6 -= c2 * a22;
+ c6 -= c3 * a30;
+ c6 -= c4 * a38;
+ c6 -= c5 * a46;
+ c6 *= a54;
+
+ c7 -= c0 * a7;
+ c7 -= c1 * a15;
+ c7 -= c2 * a23;
+ c7 -= c3 * a31;
+ c7 -= c4 * a39;
+ c7 -= c5 * a47;
+ c7 -= c6 * a55;
+ c7 *= a63;
+
+ *(c + 0) = c0;   /* write solution to C and packed B */
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 4) = c4;
+ *(c + 5) = c5;
+ *(c + 6) = c6;
+ *(c + 7) = c7;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+ *(b + 4) = c4;
+ *(b + 5) = c5;
+ *(b + 6) = c6;
+ *(b + 7) = c7;
+}
+
<br/>
+static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* 4x4 tile: bk-rank update, then solve with packed 4x4 upper triangle (a[0..3],a[5..7],a[10..11],a[15]) */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+ v2f64 src_a10, src_a11, src_a15;
+
+ LD_DP2(c, 2, src_c0, src_c1);   /* 4 doubles from each of the 4 C columns */
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+ LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+ LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+ if (bk)   /* C -= A(4 x bk) * B(bk x 4) */
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_b, src_b0, src_b1;   /* NOTE: shadows outer src_a0/src_a1 inside this scope */
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(a, 2, src_a0, src_a1);
+ LD_DP2(b, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[1] */
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);   /* broadcast b[2] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);   /* broadcast b[3] */
+ src_c6 -= src_a0 * src_b;
+ src_c7 -= src_a1 * src_b;
+
+ a += 4;
+ b += 4;
+ }
+ }
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);   /* transpose to row layout: res_cN = row N of cols {0,1}, res_c4.. of cols {2,3} */
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
+
+ src_a0 = LD_DP(a + 0);   /* triangle row 0; diagonal presumably pre-inverted — TODO confirm */
+ src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+ src_a2 = LD_DP(a + 2);
+ src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+ src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+
+ res_c0 *= src_a0;   /* forward substitution, row 0 eliminated from rows 1..3 */
+ res_c1 -= res_c0 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+
+ res_c4 *= src_a0;
+ res_c5 -= res_c4 * src_a1;
+ res_c6 -= res_c4 * src_a2;
+ res_c7 -= res_c4 * src_a3;
+
+ src_a5 = __msa_cast_to_vector_double(*(a + 5));   /* row 1 starts at a[5] */
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
+ src_a6 = LD_DP(a + 6);
+ src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+ src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+ res_c1 *= src_a5;
+ res_c2 -= res_c1 * src_a6;
+ res_c3 -= res_c1 * src_a7;
+
+ res_c5 *= src_a5;
+ res_c6 -= res_c5 * src_a6;
+ res_c7 -= res_c5 * src_a7;
+
+ src_a10 = LD_DP(a + 10);   /* row 2 at a[10..11], row 3 diagonal at a[15] */
+ src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+ src_a15 = __msa_cast_to_vector_double(*(a + 15));
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
+
+ res_c2 *= src_a10;
+ res_c3 -= res_c2 * src_a11;
+ res_c3 *= src_a15;
+
+ res_c6 *= src_a10;
+ res_c7 -= res_c6 * src_a11;
+ res_c7 *= src_a15;
+
+ ST_DP(res_c0, b + 0);   /* solved rows to packed B (row-interleaved order) */
+ ST_DP(res_c4, b + 2);
+ ST_DP(res_c1, b + 4);
+ ST_DP(res_c5, b + 6);
+ ST_DP(res_c2, b + 8);
+ ST_DP(res_c6, b + 10);
+ ST_DP(res_c3, b + 12);
+ ST_DP(res_c7, b + 14);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);   /* back to column layout for C */
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+ ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+ ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+ ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+}
+
+static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* 4x2 tile: same 4x4 triangle layout as dsolve_4x4_lt_msa, two C columns */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+ v2f64 src_a10, src_a11, src_a15;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+ if (bk)   /* C -= A(4 x bk) * B(bk x 2) */
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_b, src_b0;   /* NOTE: shadows outer src_a0/src_a1 */
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(a, 2, src_a0, src_a1);
+ src_b0 = LD_DP(b);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);   /* broadcast b[1] */
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ a += 4;
+ b += 2;
+ }
+ }
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);   /* transpose: res_cN = row N across both columns */
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+
+ src_a0 = LD_DP(a + 0);   /* triangle row 0; diagonal presumably pre-inverted — TODO confirm */
+ src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+ src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+ src_a2 = LD_DP(a + 2);
+ src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+ src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+
+ res_c0 *= src_a0;   /* forward substitution */
+ res_c1 -= res_c0 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+
+ src_a5 = __msa_cast_to_vector_double(*(a + 5));   /* row 1 */
+ src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
+ src_a6 = LD_DP(a + 6);
+ src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+ src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+ res_c1 *= src_a5;
+ res_c2 -= res_c1 * src_a6;
+ res_c3 -= res_c1 * src_a7;
+
+ src_a10 = LD_DP(a + 10);   /* row 2 and row-3 diagonal */
+ src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+ src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+ src_a15 = __msa_cast_to_vector_double(*(a + 15));
+ src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
+
+ res_c2 *= src_a10;
+ res_c3 -= res_c2 * src_a11;
+ res_c3 *= src_a15;
+
+ ST_DP(res_c0, b + 0);   /* solved rows to packed B */
+ ST_DP(res_c1, b + 2);
+ ST_DP(res_c2, b + 4);
+ ST_DP(res_c3, b + 6);
+
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);   /* back to column layout */
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+}
+
+static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)   /* scalar 4x1 solve: bk-rank update then substitution with packed 4x4 upper triangle */
+{
+ FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* c -= A(4 x bk) * b(bk) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+
+ a += 4;
+ b += 1;
+ }
+ }
+
+ a0 = *(a + 0);   /* upper triangle of row-major 4x4; slots 4, 8, 9, 12..14 skipped */
+ a1 = *(a + 1);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+ a5 = *(a + 5);
+ a6 = *(a + 6);
+ a7 = *(a + 7);
+ a10 = *(a + 10);
+ a11 = *(a + 11);
+ a15 = *(a + 15);
+
+ c0 *= a0;   /* diagonal presumably pre-inverted (multiply, not divide) — TODO confirm */
+
+ c1 -= c0 * a1;
+ c1 *= a5;
+
+ c2 -= c0 * a2;
+ c2 -= c1 * a6;
+ c2 *= a10;
+
+ c3 -= c0 * a3;
+ c3 -= c1 * a7;
+ c3 -= c2 * a11;
+ c3 *= a15;
+
+ *(b + 0) = c0;   /* write solution to packed B and C */
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+}
+
+static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* scalar 2x4 solve: packed 2x2 triangle is a[0],a[1],a[3] (a[2] below diagonal, skipped) */
+{
+ FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
+ FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+
+ c0 = *(c + 0);   /* two rows from each of the four C columns */
+ c1 = *(c + 1);
+ c0_nxt1 = *(c + ldc);
+ c1_nxt1 = *(c + 1 + ldc);
+ c0_nxt2 = *(c + 2 * ldc);
+ c1_nxt2 = *(c + 1 + 2 * ldc);
+ c0_nxt3 = *(c + 3 * ldc);
+ c1_nxt3 = *(c + 1 + 3 * ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* C -= A(2 x bk) * B(bk x 4) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c0_nxt1 -= a[0] * b[1];
+ c1_nxt1 -= a[1] * b[1];
+ c0_nxt2 -= a[0] * b[2];
+ c1_nxt2 -= a[1] * b[2];
+ c0_nxt3 -= a[0] * b[3];
+ c1_nxt3 -= a[1] * b[3];
+
+ a += 2;
+ b += 4;
+ }
+ }
+
+ a0 = *a;   /* diagonal presumably pre-inverted — TODO confirm */
+ a1 = *(a + 1);
+ a3 = *(a + 3);
+
+ c0 *= a0;   /* per-column 2-step forward substitution */
+ c1 -= c0 * a1;
+ c1 *= a3;
+
+ c0_nxt1 *= a0;
+ c1_nxt1 -= c0_nxt1 * a1;
+ c1_nxt1 *= a3;
+
+ c0_nxt2 *= a0;
+ c1_nxt2 -= c0_nxt2 * a1;
+ c1_nxt2 *= a3;
+
+ c0_nxt3 *= a0;
+ c1_nxt3 -= c0_nxt3 * a1;
+ c1_nxt3 *= a3;
+
+ *(b + 0) = c0;   /* packed B stores row 0 of all columns, then row 1 */
+ *(b + 1) = c0_nxt1;
+ *(b + 2) = c0_nxt2;
+ *(b + 3) = c0_nxt3;
+ *(b + 4) = c1;
+ *(b + 5) = c1_nxt1;
+ *(b + 6) = c1_nxt2;
+ *(b + 7) = c1_nxt3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + ldc) = c0_nxt1;
+ *(c + 1 + ldc) = c1_nxt1;
+ *(c + 0 + 2 * ldc) = c0_nxt2;
+ *(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 0 + 3 * ldc) = c0_nxt3;
+ *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* scalar 2x2 solve; triangle a[0],a[1],a[3] as in dsolve_2x4_lt_msa */
+{
+ FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ c0_nxt = *(c + ldc);   /* second C column */
+ c1_nxt = *(c + 1 + ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* C -= A(2 x bk) * B(bk x 2) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+
+ c0_nxt -= a[0] * b[1];
+ c1_nxt -= a[1] * b[1];
+
+ a += 2;
+ b += 2;
+ }
+ }
+
+ a0 = *a;   /* diagonal presumably pre-inverted — TODO confirm */
+ a1 = *(a + 1);
+ a3 = *(a + 3);
+
+ c0 *= a0;
+ c1 -= c0 * a1;
+ c1 *= a3;
+
+ c0_nxt *= a0;
+ c1_nxt -= c0_nxt * a1;
+ c1_nxt *= a3;
+
+ *(b + 0) = c0;   /* row-major packed B: row 0 of both columns, then row 1 */
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+}
+
+static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)   /* scalar 2x1 solve; single C column */
+{
+ FLOAT a0, a1, a3, c0, c1;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* c -= A(2 x bk) * b(bk) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+
+ a += 2;
+ b += 1;
+ }
+ }
+
+ a0 = *(a + 0);   /* diagonal presumably pre-inverted — TODO confirm */
+ a1 = *(a + 1);
+ a3 = *(a + 3);
+
+ c0 *= a0;
+ c1 -= c0 * a1;
+ c1 *= a3;
+
+ *(b + 0) = c0;   /* solution to packed B and C */
+ *(b + 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+}
+
+static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* 1x4 solve: one row across four C columns; triangle degenerates to the single diagonal *a */
+{
+ FLOAT c0, c1, c2, c3;
+
+ c0 = *(c + 0);   /* one element from each column */
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* c -= a(1 x bk) * B(bk x 4) */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[0] * b[1];
+ c2 -= a[0] * b[2];
+ c3 -= a[0] * b[3];
+
+ a += 1;
+ b += 4;
+ }
+ }
+
+ c0 *= *a;   /* diagonal presumably pre-inverted — TODO confirm */
+ c1 *= *a;
+ c2 *= *a;
+ c3 *= *a;
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+}
+
+static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)   /* 1x2 solve: one row across two C columns */
+{
+ FLOAT c0, c1;
+
+ c0 = *c;
+ c1 = *(c + ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* c -= a(1 x bk) * B(bk x 2) */
+ {
+ c0 -= *a * b[0];
+ c1 -= *a * b[1];
+
+ a += 1;
+ b += 2;
+ }
+ }
+
+ c0 *= *a;   /* diagonal presumably pre-inverted — TODO confirm */
+ c1 *= *a;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + ldc) = c1;
+}
+
+static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)   /* degenerate 1x1 case: dot-product update then scale by (presumably inverted) diagonal */
+{
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)   /* c -= dot(a[0..bk), b[0..bk)) */
+ {
+ *c -= *a * *b;
+
+ a += 1;
+ b += 1;
+ }
+ }
+
+ *c *= *a;
+ *b = *c;   /* mirror the solved value into packed B */
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)   /* LT trsm driver: walk C top-to-bottom in 8/4/2/1-row tiles, 4/2/1-column panels; kk = update depth (rows already solved plus offset) */
+{
+ BLASLONG i, j, kk;
+ FLOAT *aa, *cc;   /* per-panel cursors into packed A and into C */
+
+ for (j = (n >> 2); j--;)   /* full 4-column panels */
+ {
+ kk = offset;   /* no rows solved yet in this panel */
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;)   /* full 8-row tiles */
+ {
+ dsolve_8x4_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 8 * k;   /* advance one packed 8-row strip */
+ cc += 8;
+ kk += 8;   /* 8 more rows now solved */
+ }
+
+ if (m & 7)   /* row remainder: 4, then 2, then 1 */
+ {
+ if (m & 4)
+ {
+ dsolve_4x4_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ kk += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x4_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ kk += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x4_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += k;
+ cc += 1;
+ kk += 1;
+ }
+ }
+
+ b += 4 * k;   /* next packed-B panel / next 4 C columns */
+ c += 4 * ldc;
+ }
+
+ if (n & 3)   /* column remainder: 2-wide then 1-wide panel */
+ {
+ if (n & 2)
+ {
+ kk = offset;
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x2_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 8 * k;
+ cc += 8;
+ kk += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x2_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ kk += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x2_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ kk += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x2_lt_msa(aa, b, cc, ldc, kk);
+
+ aa += k;
+ cc += 1;
+ kk += 1;
+ }
+ }
+
+ b += 2 * k;
+ c += 2 * ldc;
+ }
+
+ if (n & 1)   /* final single column */
+ {
+ kk = offset;
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x1_lt_msa(aa, b, cc, kk);
+
+ aa += 8 * k;
+ cc += 8;
+ kk += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x1_lt_msa(aa, b, cc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ kk += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x1_lt_msa(aa, b, cc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ kk += 2;
+ }
+
+ if (m & 1)
+ {
+ dgmm_dsolve_1x1_msa(aa, b, cc, kk);
+
+ aa += k;
+ cc += 1;
+ kk += 1;
+ }
+ }
+
+ b += k;
+ c += ldc;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
new file mode 100644
index 000000000..44313241e
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
@@ -0,0 +1,953 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* Solve one 8x4 tile (RN): C -= A*B over bk solved columns, then forward-solve against the upper-triangular 4x4 block of packed B; result goes to C and back into packed A. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+ v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; /* named after their index into the packed 4x4 B block */
+ v2f64 src_b10, src_b11, src_b15;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); /* 8 doubles of C, column 0 */
+ LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+ LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+ LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+ if (bk) /* GEMM update over the bk already-solved columns; software-pipelined: loads for step i+1 overlap the math of step i */
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v2f64 src_b;
+
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline */
+ LD_DP2(b, 2, src_b0, src_b1);
+
+ for (i = (bk - 1); i--;) /* bk-1 pipelined steps; the final step is peeled below */
+ {
+ a += 8;
+ b += 4;
+
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+ LD_DP2(b, 2, src_b2, src_b3);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast low double (b[0]) */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast high double (b[1]) */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ src_a0 = src_a4; /* rotate prefetched registers into place */
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b2;
+ src_b1 = src_b3;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final iteration (no trailing load) */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ a += 8; /* leave a/b pointing at the 4x4 diagonal block */
+ b += 4;
+ }
+
+ src_b0 = LD_DP(b + 0); /* upper triangle of B at indices 0,1,2,3,5,6,7,10,11,15 (row-major 4x4) */
+ src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b2 = LD_DP(b + 2);
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+ src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+ src_b5 = __msa_cast_to_vector_double(*(b + 5));
+ src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
+ src_b6 = LD_DP(b + 6);
+ src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
+ src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
+ src_b10 = LD_DP(b + 10);
+ src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
+ src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+ src_b15 = __msa_cast_to_vector_double(*(b + 15));
+ src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
+
+ src_c0 *= src_b0; /* diagonals are multiplied, never divided: packed B presumably stores inverted diagonals (standard OpenBLAS trsm packing — confirm in the packing routine) */
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ src_c4 -= src_c0 * src_b1; /* forward substitution: column j uses all solved columns < j */
+ src_c5 -= src_c1 * src_b1;
+ src_c6 -= src_c2 * src_b1;
+ src_c7 -= src_c3 * src_b1;
+
+ src_c4 *= src_b5;
+ src_c5 *= src_b5;
+ src_c6 *= src_b5;
+ src_c7 *= src_b5;
+
+ src_c8 -= src_c0 * src_b2;
+ src_c9 -= src_c1 * src_b2;
+ src_c10 -= src_c2 * src_b2;
+ src_c11 -= src_c3 * src_b2;
+
+ src_c8 -= src_c4 * src_b6;
+ src_c9 -= src_c5 * src_b6;
+ src_c10 -= src_c6 * src_b6;
+ src_c11 -= src_c7 * src_b6;
+
+ src_c8 *= src_b10;
+ src_c9 *= src_b10;
+ src_c10 *= src_b10;
+ src_c11 *= src_b10;
+
+ src_c12 -= src_c0 * src_b3;
+ src_c13 -= src_c1 * src_b3;
+ src_c14 -= src_c2 * src_b3;
+ src_c15 -= src_c3 * src_b3;
+
+ src_c12 -= src_c4 * src_b7;
+ src_c13 -= src_c5 * src_b7;
+ src_c14 -= src_c6 * src_b7;
+ src_c15 -= src_c7 * src_b7;
+
+ src_c12 -= src_c8 * src_b11;
+ src_c13 -= src_c9 * src_b11;
+ src_c14 -= src_c10 * src_b11;
+ src_c15 -= src_c11 * src_b11;
+
+ src_c12 *= src_b15;
+ src_c13 *= src_b15;
+ src_c14 *= src_b15;
+ src_c15 *= src_b15;
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); /* store solved tile to C ... */
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* ... and back into packed A for reuse by later tiles */
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+ ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
+ ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
+ ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2);
+ ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
+}
+
+static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RN solve of an 8x2 tile: GEMM update over bk columns, then solve against the upper-triangular 2x2 B block (indices 0,1,3); result stored to C and packed A. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_b0, src_b1, src_b3, src_b;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+ if (bk)
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); /* prime the software pipeline */
+ src_b0 = LD_DP(b);
+
+ a += 8;
+ b += 2;
+
+ for (i = (bk - 1); i--;) /* last iteration peeled below */
+ {
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(b);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
+ a += 8;
+ b += 2;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final step */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+ }
+
+ src_b0 = LD_DP(b + 0); /* 2x2 upper triangle of B: b0 (diag), b1, b3 (diag) */
+ src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b3 = __msa_cast_to_vector_double(*(b + 3));
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
+
+ src_c0 *= src_b0; /* diagonal is multiplied — presumably pre-inverted by the packing (confirm) */
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ src_c4 -= src_c0 * src_b1;
+ src_c5 -= src_c1 * src_b1;
+ src_c6 -= src_c2 * src_b1;
+ src_c7 -= src_c3 * src_b1;
+
+ src_c4 *= src_b3;
+ src_c5 *= src_b3;
+ src_c6 *= src_b3;
+ src_c7 *= src_b3;
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A for later tiles */
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* RN solve of an 8x1 tile: C -= A*b over bk steps, then scale by the (presumably pre-inverted) B diagonal. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3;
+ v2f64 src_b0;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+
+ if (bk)
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_b;
+
+ for (i = bk; i--;)
+ {
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b = LD_DP(b); /* NOTE(review): loads two doubles though only element 0 is used; relies on readable padding past b — confirm packing guarantees */
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b); /* broadcast b[0] into both lanes */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ a += 8;
+ b += 1;
+ }
+ }
+
+ src_b0 = __msa_cast_to_vector_double(*b);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+
+ src_c0 *= src_b0;
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A */
+}
+
+static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RN solve of a 4x4 tile; same structure as dsolve_8x4_rn_msa with half-width rows. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
+ v2f64 src_b10, src_b11, src_b15;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+ LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+ LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+ if (bk)
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_b, src_b0, src_b1; /* NOTE(review): these src_b0/src_b1 shadow the function-scope vectors above */
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(a, 2, src_a0, src_a1);
+ LD_DP2(b, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c6 -= src_a0 * src_b;
+ src_c7 -= src_a1 * src_b;
+
+ a += 4;
+ b += 4;
+ }
+ }
+
+ src_b0 = LD_DP(b + 0); /* upper triangle of the 4x4 B block */
+ src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b2 = LD_DP(b + 2);
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+ src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+ src_b5 = __msa_cast_to_vector_double(*(b + 5));
+ src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
+ src_b6 = LD_DP(b + 6);
+ src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
+ src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
+ src_b10 = LD_DP(b + 10);
+ src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
+ src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+ src_b15 = __msa_cast_to_vector_double(*(b + 15));
+ src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
+
+ src_c0 *= src_b0; /* forward substitution; diagonals presumably pre-inverted (confirm in packing) */
+ src_c1 *= src_b0;
+
+ src_c2 -= src_c0 * src_b1;
+ src_c3 -= src_c1 * src_b1;
+
+ src_c2 *= src_b5;
+ src_c3 *= src_b5;
+
+ src_c4 -= src_c0 * src_b2;
+ src_c5 -= src_c1 * src_b2;
+
+ src_c4 -= src_c2 * src_b6;
+ src_c5 -= src_c3 * src_b6;
+
+ src_c4 *= src_b10;
+ src_c5 *= src_b10;
+
+ src_c6 -= src_c0 * src_b3;
+ src_c7 -= src_c1 * src_b3;
+
+ src_c6 -= src_c2 * src_b7;
+ src_c7 -= src_c3 * src_b7;
+
+ src_c6 -= src_c4 * src_b11;
+ src_c7 -= src_c5 * src_b11;
+
+ src_c6 *= src_b15;
+ src_c7 *= src_b15;
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+ ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+ ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A */
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RN solve of a 4x2 tile against the 2x2 upper triangle of B (indices 0,1,3). */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+ if (bk)
+ {
+ BLASLONG i;
+ v2f64 src_a0, src_a1, src_b, src_b0; /* NOTE(review): src_b0 shadows the function-scope vector above */
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(a, 2, src_a0, src_a1);
+ src_b0 = LD_DP(b);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ a += 4;
+ b += 2;
+ }
+ }
+
+ src_b0 = LD_DP(b + 0);
+ src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b3 = __msa_cast_to_vector_double(*(b + 3));
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
+
+ src_c0 *= src_b0; /* diagonal multiply (pre-inverted diagonal assumed — confirm) */
+ src_c1 *= src_b0;
+
+ src_c2 -= src_c0 * src_b1;
+ src_c3 -= src_c1 * src_b1;
+
+ src_c2 *= src_b3;
+ src_c3 *= src_b3;
+
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A */
+}
+
+static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* Scalar RN solve of a 4x1 tile: update by bk rank-1 steps, scale by B diagonal, write back to C and packed A. */
+{
+ FLOAT c0, c1, c2, c3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+
+ a += 4;
+ b += 1;
+ }
+ }
+
+ c0 *= *b; /* scale by diagonal (multiplied, so presumably pre-inverted — confirm) */
+ c1 *= *b;
+ c2 *= *b;
+ c3 *= *b;
+
+ *(a + 0) = c0; /* write solved values back into packed A */
+ *(a + 1) = c1;
+ *(a + 2) = c2;
+ *(a + 3) = c3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+}
+
+static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* Scalar RN solve of a 2x4 tile against the 4x4 upper triangle of B (indices 0,1,2,3,5,6,7,10,11,15). */
+{
+ FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15;
+ FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; /* row 0 of C across the 4 columns */
+ FLOAT c1, c1_nxt1, c1_nxt2, c1_nxt3; /* row 1 of C across the 4 columns */
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt1 = *(c + 0 + 1 * ldc);
+ c1_nxt1 = *(c + 1 + 1 * ldc);
+ c0_nxt2 = *(c + 0 + 2 * ldc);
+ c1_nxt2 = *(c + 1 + 2 * ldc);
+ c0_nxt3 = *(c + 0 + 3 * ldc);
+ c1_nxt3 = *(c + 1 + 3 * ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;) /* GEMM update over already-solved columns */
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c0_nxt1 -= a[0] * b[1];
+ c1_nxt1 -= a[1] * b[1];
+ c0_nxt2 -= a[0] * b[2];
+ c1_nxt2 -= a[1] * b[2];
+ c0_nxt3 -= a[0] * b[3];
+ c1_nxt3 -= a[1] * b[3];
+
+ a += 2;
+ b += 4;
+ }
+ }
+
+ b0 = *(b + 0); /* upper-triangular entries of the 4x4 B block */
+ b1 = *(b + 1);
+ b2 = *(b + 2);
+ b3 = *(b + 3);
+ b5 = *(b + 5);
+ b6 = *(b + 6);
+ b7 = *(b + 7);
+ b10 = *(b + 10);
+ b11 = *(b + 11);
+ b15 = *(b + 15);
+
+ c0 *= b0; /* forward substitution column by column; diagonals presumably pre-inverted (confirm) */
+ c1 *= b0;
+
+ c0_nxt1 -= c0 * b1;
+ c1_nxt1 -= c1 * b1;
+ c0_nxt1 *= b5;
+ c1_nxt1 *= b5;
+
+ c0_nxt2 -= c0 * b2;
+ c1_nxt2 -= c1 * b2;
+ c0_nxt2 -= c0_nxt1 * b6;
+ c1_nxt2 -= c1_nxt1 * b6;
+ c0_nxt2 *= b10;
+ c1_nxt2 *= b10;
+
+ c0_nxt3 -= c0 * b3;
+ c1_nxt3 -= c1 * b3;
+ c0_nxt3 -= c0_nxt1 * b7;
+ c1_nxt3 -= c1_nxt1 * b7;
+ c0_nxt3 -= c0_nxt2 * b11;
+ c1_nxt3 -= c1_nxt2 * b11;
+ c0_nxt3 *= b15;
+ c1_nxt3 *= b15;
+
+ *(a + 0) = c0; /* write solved 2x4 tile back into packed A (column-major within the tile) */
+ *(a + 1) = c1;
+ *(a + 2) = c0_nxt1;
+ *(a + 3) = c1_nxt1;
+ *(a + 4) = c0_nxt2;
+ *(a + 5) = c1_nxt2;
+ *(a + 6) = c0_nxt3;
+ *(a + 7) = c1_nxt3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 1 * ldc) = c0_nxt1;
+ *(c + 1 + 1 * ldc) = c1_nxt1;
+ *(c + 2 * ldc) = c0_nxt2;
+ *(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 3 * ldc) = c0_nxt3;
+ *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* Scalar RN solve of a 2x2 tile against the 2x2 upper triangle of B (indices 0,1,3). */
+{
+ FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; /* _nxt = second column of C */
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+
+ c0_nxt -= a[0] * b[1];
+ c1_nxt -= a[1] * b[1];
+
+ a += 2;
+ b += 2;
+ }
+ }
+
+ b0 = *(b + 0);
+ b1 = *(b + 1);
+ b3 = *(b + 3);
+
+ c0 *= b0; /* diagonal multiply (pre-inverted diagonal assumed — confirm) */
+ c1 *= b0;
+
+ c0_nxt -= c0 * b1;
+ c1_nxt -= c1 * b1;
+
+ c0_nxt *= b3;
+ c1_nxt *= b3;
+
+ *(a + 0) = c0; /* write-back into packed A */
+ *(a + 1) = c1;
+ *(a + 2) = c0_nxt;
+ *(a + 3) = c1_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+}
+
+static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* Scalar RN solve of a 2x1 tile: rank-1 updates, then scale by B diagonal. */
+{
+ FLOAT b0, c0, c1;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+
+ a += 2;
+ b += 1;
+ }
+ }
+
+ b0 = *b; /* diagonal entry; multiplied, so presumably pre-inverted (confirm) */
+
+ c0 *= b0;
+ c1 *= b0;
+
+ *(a + 0) = c0; /* write-back into packed A */
+ *(a + 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+}
+
+static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* Scalar RN solve of a 1x4 tile against the 4x4 upper triangle of B. */
+{
+ FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
+
+ c0 = *(c + 0); /* one C element per column */
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[0] * b[1];
+ c2 -= a[0] * b[2];
+ c3 -= a[0] * b[3];
+
+ a += 1;
+ b += 4;
+ }
+ }
+
+ b0 = *(b + 0); /* upper-triangular entries of the 4x4 B block */
+ b1 = *(b + 1);
+ b2 = *(b + 2);
+ b3 = *(b + 3);
+ b5 = *(b + 5);
+ b6 = *(b + 6);
+ b7 = *(b + 7);
+ b10 = *(b + 10);
+ b11 = *(b + 11);
+ b15 = *(b + 15);
+
+ c0 *= b0; /* forward substitution; diagonals presumably pre-inverted (confirm) */
+
+ c1 -= c0 * b1;
+ c1 *= b5;
+
+ c2 -= c0 * b2;
+ c2 -= c1 * b6;
+ c2 *= b10;
+
+ c3 -= c0 * b3;
+ c3 -= c1 * b7;
+ c3 -= c2 * b11;
+ c3 *= b15;
+
+ *(a + 0) = c0; /* write-back into packed A */
+ *(a + 1) = c1;
+ *(a + 2) = c2;
+ *(a + 3) = c3;
+
+ *(c + 0) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+}
+
+static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* Scalar RN solve of a 1x2 tile against the 2x2 upper triangle of B (indices 0,1,3). */
+{
+ FLOAT b0, b1, b3, c0, c1;
+
+ c0 = *c;
+ c1 = *(c + ldc);
+
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ c0 -= *a * b[0];
+ c1 -= *a * b[1];
+
+ a += 1;
+ b += 2;
+ }
+ }
+
+ b0 = *(b + 0);
+ b1 = *(b + 1);
+ b3 = *(b + 3);
+
+ c0 *= b0; /* diagonal multiply (pre-inverted diagonal assumed — confirm) */
+
+ c1 -= c0 * b1;
+ c1 *= b3;
+
+ *(a + 0) = c0; /* write-back into packed A */
+ *(a + 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + ldc) = c1;
+}
+
+static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* 1x1 solve: c -= dot(a[0:bk], b[0:bk]); then c *= diagonal from a (multiplied, so presumably pre-inverted) and the result is written back into the packed buffer b. Callers swap a/b to pick which side holds the diagonal. */
+{
+ if (bk)
+ {
+ BLASLONG i;
+
+ for (i = bk; i--;)
+ {
+ *c -= *a * *b;
+
+ a += 1;
+ b += 1;
+ }
+ }
+
+ *c *= *a; /* a now points at the diagonal entry */
+ *b = *c; /* write solved value back into the packed buffer */
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, /* dummy1: unused alpha slot kept for the common trsm kernel signature */
+ FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{ /* RN-variant trsm driver: tiles C into 8x4 panels solved by dsolve_8x4_rn_msa, with 4/2/1-wide edge kernels for the remainders. */
+ BLASLONG i, j, kk; /* kk = number of already-solved B columns; unlike the LT driver it is set once and grows across column panels */
+ FLOAT *aa, *cc;
+
+ kk = -offset;
+
+ for (j = (n >> 2); j--;) /* full 4-column panels */
+ {
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;) /* full 8-row tiles */
+ {
+ dsolve_8x4_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 8 * k; /* next packed 8-row panel of A */
+ cc += 8;
+ }
+
+ if (m & 7) /* row remainder: 4, 2, then 1 rows */
+ {
+ if (m & 4)
+ {
+ dsolve_4x4_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x4_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x4_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += k;
+ cc += 1;
+ }
+ }
+
+ kk += 4; /* 4 more B columns are now solved */
+ b += 4 * k;
+ c += 4 * ldc;
+ }
+
+ if (n & 3) /* column remainder: 2-wide then 1-wide panels */
+ {
+ if (n & 2)
+ {
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x2_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 8 * k;
+ cc += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x2_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x2_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x2_rn_msa(aa, b, cc, ldc, kk);
+
+ aa += k;
+ cc += 1;
+ }
+ }
+
+ b += 2 * k;
+ c += 2 * ldc;
+ kk += 2;
+ }
+
+ if (n & 1)
+ {
+ aa = a;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x1_rn_msa(aa, b, cc, kk);
+
+ aa += 8 * k;
+ cc += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x1_rn_msa(aa, b, cc, kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x1_rn_msa(aa, b, cc, kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dgmm_dsolve_1x1_msa(b, aa, cc, kk); /* args deliberately swapped vs the LT driver: for RN the diagonal lives in packed B (1st arg) and the solved value is written back into packed A (2nd arg) */
+
+ aa += k;
+ cc += 1;
+ }
+ }
+
+ b += k;
+ c += ldc;
+ kk += 1;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
new file mode 100644
index 000000000..49274e5bc
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
@@ -0,0 +1,1015 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RT solve of an 8x4 tile: GEMM update over bk trailing columns, then BACKWARD substitution (column 3 first) against the 4x4 B triangle at indices 0,4,5,8,9,10,12,13,14,15. a/b arrive pointing one tile past the block and are rewound below. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+ v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
+ v2f64 src_b14, src_b15;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+ LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+ LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+ if (bk > 0) /* GEMM update, software-pipelined over copies pba/pbb so a/b keep their original values for the rewind below */
+ {
+ BLASLONG i;
+ FLOAT *pba = a, *pbb = b;
+ v2f64 src_b, src_b0, src_b1, src_b2, src_b3; /* NOTE(review): src_b0 shadows the function-scope vector above */
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+
+ LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline */
+ LD_DP2(pbb, 2, src_b0, src_b1);
+
+ for (i = (bk - 1); i--;) /* last iteration peeled below */
+ {
+ pba += 8;
+ pbb += 4;
+
+ LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
+ LD_DP2(pbb, 2, src_b2, src_b3);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ src_a0 = src_a4; /* rotate prefetched registers */
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b2;
+ src_b1 = src_b3;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final step */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+ }
+
+ a -= 32; /* rewind to the start of this 8x4 A tile (8*4 doubles) */
+ b -= 16; /* rewind to the start of the 4x4 B diagonal block */
+
+ src_b12 = LD_DP(b + 12); /* triangle of B, read in reverse order for back-substitution */
+ src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
+ src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
+ src_b14 = LD_DP(b + 14);
+ src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
+ src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
+
+ src_b8 = LD_DP(b + 8);
+ src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
+ src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
+ src_b10 = __msa_cast_to_vector_double(*(b + 10));
+ src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+
+ src_b0 = __msa_cast_to_vector_double(*(b + 0));
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b4 = LD_DP(b + 4);
+ src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
+ src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
+
+ src_c12 *= src_b15; /* back-substitution: last column first; diagonals presumably pre-inverted (confirm) */
+ src_c13 *= src_b15;
+ src_c14 *= src_b15;
+ src_c15 *= src_b15;
+
+ src_c8 -= src_c12 * src_b14;
+ src_c9 -= src_c13 * src_b14;
+ src_c10 -= src_c14 * src_b14;
+ src_c11 -= src_c15 * src_b14;
+
+ src_c8 *= src_b10;
+ src_c9 *= src_b10;
+ src_c10 *= src_b10;
+ src_c11 *= src_b10;
+
+ src_c4 -= src_c12 * src_b13;
+ src_c5 -= src_c13 * src_b13;
+ src_c6 -= src_c14 * src_b13;
+ src_c7 -= src_c15 * src_b13;
+
+ src_c4 -= src_c8 * src_b9;
+ src_c5 -= src_c9 * src_b9;
+ src_c6 -= src_c10 * src_b9;
+ src_c7 -= src_c11 * src_b9;
+
+ src_c4 *= src_b5;
+ src_c5 *= src_b5;
+ src_c6 *= src_b5;
+ src_c7 *= src_b5;
+
+ src_c0 -= src_c12 * src_b12;
+ src_c1 -= src_c13 * src_b12;
+ src_c2 -= src_c14 * src_b12;
+ src_c3 -= src_c15 * src_b12;
+
+ src_c0 -= src_c8 * src_b8;
+ src_c1 -= src_c9 * src_b8;
+ src_c2 -= src_c10 * src_b8;
+ src_c3 -= src_c11 * src_b8;
+
+ src_c0 -= src_c4 * src_b4;
+ src_c1 -= src_c5 * src_b4;
+ src_c2 -= src_c6 * src_b4;
+ src_c3 -= src_c7 * src_b4;
+
+ src_c0 *= src_b0;
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); /* store to C and write back into packed A, last column first */
+ ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
+ ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
+ ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RT solve of an 8x2 tile: GEMM update, then back-substitution against the 2x2 B triangle (indices 0,2,3); a/b arrive past the tile and are rewound below. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_b0, src_b2, src_b3;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+ if (bk > 0) /* update loop works on copies pba/pbb; a/b stay put for the rewind */
+ {
+ BLASLONG i;
+ FLOAT *pba = a, *pbb = b;
+ v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3;
+ v2f64 src_a4, src_a5, src_a6, src_a7;
+
+ LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline */
+ src_b0 = LD_DP(pbb);
+
+ for (i = bk - 1; i--;) /* last iteration peeled below */
+ {
+ pba += 8;
+ pbb += 2;
+
+ LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(pbb);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+ }
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final step */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+ }
+
+ a -= 16; /* rewind to the start of this 8x2 A tile */
+ b -= 4; /* rewind to the 2x2 B diagonal block */
+
+ src_b0 = __msa_cast_to_vector_double(*(b + 0));
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b2 = LD_DP(b + 2);
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+ src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+
+ src_c4 *= src_b3; /* back-substitution: second column first; diagonals presumably pre-inverted (confirm) */
+ src_c5 *= src_b3;
+ src_c6 *= src_b3;
+ src_c7 *= src_b3;
+
+ src_c0 -= src_c4 * src_b2;
+ src_c1 -= src_c5 * src_b2;
+ src_c2 -= src_c6 * src_b2;
+ src_c3 -= src_c7 * src_b2;
+
+ src_c0 *= src_b0;
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A */
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* RT solve of an 8x1 tile: dot-update over bk steps, rewind, then scale by the B diagonal. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3;
+ v2f64 src_b0;
+
+ LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+
+ if (bk > 0) /* pipelined update over copies aa/bb; a/b stay put for the rewind */
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v2f64 src_b1;
+
+ LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline */
+ src_b0 = LD_DP(bb); /* NOTE(review): loads two doubles though only element 0 is consumed per step — assumes readable padding past bb; confirm */
+
+ aa += 8;
+ bb += 1;
+
+ for (i = (bk - 1); i--;) /* last iteration peeled below */
+ {
+ LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(bb);
+
+ src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] into both lanes */
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a2 * src_b0;
+ src_c3 -= src_a3 * src_b0;
+
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
+ aa += 8;
+ bb += 1;
+ }
+
+ src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final step */
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a2 * src_b0;
+ src_c3 -= src_a3 * src_b0;
+ }
+
+ a -= 8; /* rewind to the start of this 8x1 A tile */
+ b -= 1; /* rewind to the B diagonal entry */
+
+ src_b0 = __msa_cast_to_vector_double(*b);
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+
+ src_c0 *= src_b0; /* diagonal multiply (pre-inverted diagonal assumed — confirm) */
+ src_c1 *= src_b0;
+ src_c2 *= src_b0;
+ src_c3 *= src_b0;
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); /* write-back into packed A */
+}
+
+static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* RT solve of a 4x4 tile; same back-substitution structure as dsolve_8x4_rt_msa with half-width rows. */
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
+ v2f64 src_b14, src_b15;
+
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+ LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+ LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+ if (bk > 0) /* update loop over copies aa/bb; a/b stay put for the rewind */
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_b, src_b0, src_b1; /* NOTE(review): src_b0 shadows the function-scope vector above */
+
+ for (i = bk; i--;)
+ {
+ LD_DP2(aa, 2, src_a0, src_a1);
+ LD_DP2(bb, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[0] */
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b[1] */
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c6 -= src_a0 * src_b;
+ src_c7 -= src_a1 * src_b;
+
+ aa += 4;
+ bb += 4;
+ }
+ }
+
+ a -= 16; /* rewind to the start of this 4x4 A tile */
+ b -= 16; /* rewind to the 4x4 B diagonal block */
+
+ src_b12 = LD_DP(b + 12); /* B triangle read in reverse order for back-substitution */
+ src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
+ src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
+ src_b14 = LD_DP(b + 14);
+ src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
+ src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
+
+ src_b8 = LD_DP(b + 8);
+ src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
+ src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
+ src_b10 = __msa_cast_to_vector_double(*(b + 10));
+ src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+
+ src_b0 = __msa_cast_to_vector_double(*(b + 0));
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b4 = LD_DP(b + 4);
+ src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
+ src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
+
+ src_c6 *= src_b15; /* back-substitution, last column first; diagonals presumably pre-inverted (confirm) */
+ src_c7 *= src_b15;
+
+ src_c4 -= src_c6 * src_b14;
+ src_c5 -= src_c7 * src_b14;
+
+ src_c4 *= src_b10;
+ src_c5 *= src_b10;
+
+ src_c2 -= src_c6 * src_b13;
+ src_c3 -= src_c7 * src_b13;
+
+ src_c2 -= src_c4 * src_b9;
+ src_c3 -= src_c5 * src_b9;
+
+ src_c2 *= src_b5;
+ src_c3 *= src_b5;
+
+ src_c0 -= src_c6 * src_b12;
+ src_c1 -= src_c7 * src_b12;
+
+ src_c0 -= src_c4 * src_b8;
+ src_c1 -= src_c5 * src_b8;
+
+ src_c0 -= src_c2 * src_b4;
+ src_c1 -= src_c3 * src_b4;
+
+ src_c0 *= src_b0;
+ src_c1 *= src_b0;
+
+ ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); /* store to C, last column first */
+ ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+ ST_DP2(src_c0, src_c1, c, 2);
+
+ ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); /* write-back into packed A */
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* Solve one 4x2 block for the RT dtrsm kernel with MSA intrinsics.
+   Same contract as dsolve_4x4_rt_msa but with a 2x2 triangular block of B;
+   bk already-solved columns are subtracted before the substitution. */
+static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
+
+ /* Load the two columns of the 4x2 C block. */
+ LD_DP2(c, 2, src_c0, src_c1);
+ LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_b, src_b0;
+
+ /* GEMM update: C -= A * B over the bk already-solved columns. */
+ for (i = bk; i--;)
+ {
+ LD_DP2(aa, 2, src_a0, src_a1);
+ src_b0 = LD_DP(bb);
+
+ /* ilvr/ilvl of a vector with itself broadcasts its low/high lane. */
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ aa += 4;
+ bb += 2;
+ }
+ }
+
+ /* Step back onto the 2x2 triangular block of B and the A slot to fill. */
+ a -= 8;
+ b -= 4;
+
+ /* Broadcast b[0] (diagonal 0), b[2] (off-diagonal), b[3] (diagonal 1). */
+ src_b0 = __msa_cast_to_vector_double(*(b + 0));
+ src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+ src_b2 = LD_DP(b + 2);
+ src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+ src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+
+ /* Backward substitution: solve column 1, eliminate it from column 0. */
+ src_c2 *= src_b3;
+ src_c3 *= src_b3;
+
+ src_c0 -= src_c2 * src_b2;
+ src_c1 -= src_c3 * src_b2;
+
+ src_c0 *= src_b0;
+ src_c1 *= src_b0;
+
+ /* Write the solved block back to C and into the packed A buffer. */
+ ST_DP2(src_c0, src_c1, c, 2);
+ ST_DP2(src_c2, src_c3, c + ldc, 2);
+
+ ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* Solve one 4x1 block for the RT dtrsm kernel (scalar path).
+   Subtracts the bk already-solved columns, scales by the diagonal factor
+   stored just before b, and writes the result to both C and the packed
+   A buffer (four slots before a). */
+static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG j;
+    FLOAT t[4];
+
+    /* Pull the four entries of this column of C into locals. */
+    for (j = 0; j < 4; j++)
+    {
+        t[j] = c[j];
+    }
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: subtract the already-solved contributions. */
+        for (i = 0; i < bk; i++)
+        {
+            for (j = 0; j < 4; j++)
+            {
+                t[j] -= pa[j] * pb[0];
+            }
+            pa += 4;
+            pb += 1;
+        }
+    }
+
+    /* Step back to the packed A slot for this block. */
+    a -= 4;
+
+    /* Scale by the diagonal factor located just before b. */
+    for (j = 0; j < 4; j++)
+    {
+        t[j] *= *(b - 1);
+    }
+
+    /* Mirror the solved column into the packed A buffer and into C. */
+    for (j = 0; j < 4; j++)
+    {
+        a[j] = t[j];
+        c[j] = t[j];
+    }
+}
+
+/* Solve one 2x4 block for the RT dtrsm kernel (scalar path).
+   r[j]/s[j] hold rows 0 and 1 of column j of C.  After the GEMM update
+   over the bk already-solved columns, columns are back-substituted from
+   last to first against the 4x4 triangular block of B, then written to
+   both C and the packed A buffer. */
+static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG j;
+    FLOAT r[4], s[4];
+
+    for (j = 0; j < 4; j++)
+    {
+        r[j] = c[j * ldc];
+        s[j] = c[1 + j * ldc];
+    }
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: C -= A * B over the trailing bk columns. */
+        for (i = 0; i < bk; i++)
+        {
+            for (j = 0; j < 4; j++)
+            {
+                r[j] -= pa[0] * pb[j];
+                s[j] -= pa[1] * pb[j];
+            }
+            pa += 2;
+            pb += 4;
+        }
+    }
+
+    /* Step back onto the 4x4 triangular block of B and the A slot. */
+    a -= 8;
+    b -= 16;
+
+    /* Backward substitution, last column first. */
+    r[3] *= b[15];
+    s[3] *= b[15];
+
+    r[2] -= r[3] * b[14];
+    s[2] -= s[3] * b[14];
+    r[2] *= b[10];
+    s[2] *= b[10];
+
+    r[1] -= r[3] * b[13];
+    s[1] -= s[3] * b[13];
+    r[1] -= r[2] * b[9];
+    s[1] -= s[2] * b[9];
+    r[1] *= b[5];
+    s[1] *= b[5];
+
+    r[0] -= r[3] * b[12];
+    s[0] -= s[3] * b[12];
+    r[0] -= r[2] * b[8];
+    s[0] -= s[2] * b[8];
+    r[0] -= r[1] * b[4];
+    s[0] -= s[1] * b[4];
+    r[0] *= b[0];
+    s[0] *= b[0];
+
+    /* Write the solved block into the packed A buffer and back into C. */
+    for (j = 0; j < 4; j++)
+    {
+        a[2 * j] = r[j];
+        a[2 * j + 1] = s[j];
+        c[j * ldc] = r[j];
+        c[1 + j * ldc] = s[j];
+    }
+}
+
+/* Solve one 2x2 block for the RT dtrsm kernel (scalar path).
+   r0/r1 are column 0 of C, s0/s1 column 1 (column stride ldc). */
+static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT diag0, diag1, off;
+    FLOAT r0, r1, s0, s1;
+
+    r0 = c[0];
+    r1 = c[1];
+    s0 = c[ldc];
+    s1 = c[ldc + 1];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: subtract the already-solved contributions. */
+        for (i = 0; i < bk; i++)
+        {
+            r0 -= pa[0] * pb[0];
+            r1 -= pa[1] * pb[0];
+            s0 -= pa[0] * pb[1];
+            s1 -= pa[1] * pb[1];
+            pa += 2;
+            pb += 2;
+        }
+    }
+
+    /* Step back onto the 2x2 triangular block of B and the A slot. */
+    a -= 4;
+    b -= 4;
+
+    diag1 = b[3];
+    off = b[2];
+    diag0 = b[0];
+
+    /* Backward substitution: solve column 1 first, then column 0. */
+    s0 *= diag1;
+    s1 *= diag1;
+
+    r0 -= s0 * off;
+    r1 -= s1 * off;
+    r0 *= diag0;
+    r1 *= diag0;
+
+    /* Mirror the solved block into the packed A buffer and into C. */
+    a[0] = r0;
+    a[1] = r1;
+    a[2] = s0;
+    a[3] = s1;
+
+    c[0] = r0;
+    c[1] = r1;
+    c[ldc] = s0;
+    c[ldc + 1] = s1;
+}
+
+/* Solve one 2x1 block for the RT dtrsm kernel (scalar path). */
+static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT r0 = c[0];
+    FLOAT r1 = c[1];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: subtract the already-solved contributions. */
+        for (i = 0; i < bk; i++)
+        {
+            r0 -= pa[0] * pb[0];
+            r1 -= pa[1] * pb[0];
+            pa += 2;
+            pb += 1;
+        }
+    }
+
+    /* Scale by the diagonal factor located just before b. */
+    r0 *= *(b - 1);
+    r1 *= *(b - 1);
+
+    /* Mirror into the packed A buffer (two slots before a) and into C. */
+    *(a - 2) = r0;
+    *(a - 1) = r1;
+
+    c[0] = r0;
+    c[1] = r1;
+}
+
+/* Solve one 1x4 block for the RT dtrsm kernel (scalar path).
+   r[j] holds the single row of column j of C (column stride ldc). */
+static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG j;
+    FLOAT r[4];
+
+    for (j = 0; j < 4; j++)
+    {
+        r[j] = c[j * ldc];
+    }
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: C -= A * B over the trailing bk columns. */
+        for (i = 0; i < bk; i++)
+        {
+            for (j = 0; j < 4; j++)
+            {
+                r[j] -= pa[0] * pb[j];
+            }
+            pa += 1;
+            pb += 4;
+        }
+    }
+
+    /* Step back onto the 4x4 triangular block of B and the A slot. */
+    a -= 4;
+    b -= 16;
+
+    /* Backward substitution, last column first. */
+    r[3] *= b[15];
+
+    r[2] -= r[3] * b[14];
+    r[2] *= b[10];
+
+    r[1] -= r[3] * b[13];
+    r[1] -= r[2] * b[9];
+    r[1] *= b[5];
+
+    r[0] -= r[3] * b[12];
+    r[0] -= r[2] * b[8];
+    r[0] -= r[1] * b[4];
+    r[0] *= b[0];
+
+    /* Write back into the packed A panel and into C. */
+    for (j = 0; j < 4; j++)
+    {
+        a[j] = r[j];
+        c[j * ldc] = r[j];
+    }
+}
+
+/* Solve one 1x2 block for the RT dtrsm kernel (scalar path). */
+static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT r0, r1;
+
+    r0 = c[0];
+    r1 = c[ldc];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a, *pb = b;
+
+        /* GEMM update: subtract the already-solved contributions. */
+        for (i = 0; i < bk; i++)
+        {
+            r0 -= pa[0] * pb[0];
+            r1 -= pa[0] * pb[1];
+            pa += 1;
+            pb += 2;
+        }
+    }
+
+    /* Step back onto the 2x2 triangular block of B and the A slot. */
+    a -= 2;
+    b -= 4;
+
+    /* Solve column 1 first, then eliminate it from column 0. */
+    r1 *= b[3];
+
+    r0 -= r1 * b[2];
+    r0 *= b[0];
+
+    /* Mirror into the packed A buffer and into C. */
+    a[0] = r0;
+    a[1] = r1;
+
+    c[0] = r0;
+    c[ldc] = r1;
+}
+
+/* Solve one 1x1 block for the RT dtrsm kernel: a dot-product update over
+   the bk already-solved columns followed by the diagonal scaling. */
+static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+
+    /* Subtract the already-solved contributions (no-op when bk <= 0). */
+    for (i = 0; i < bk; i++)
+    {
+        *c -= a[i] * b[i];
+    }
+
+    /* Scale by the diagonal factor located just before b. */
+    *c *= *(b - 1);
+
+    /* Mirror the solved value into the packed A buffer. */
+    *(a - 1) = *c;
+}
+
+/* RT (right-side, transposed-B) dtrsm driver.  Columns of C are processed
+   right to left: the n%4 remainder columns first (1 then 2), then strips of
+   4; within each strip, rows are handled 8/4/2/1 at a time.
+   kk counts the columns still unsolved to the left, so k - kk is the number
+   of already-solved columns each block kernel subtracts ("bk").
+   dummy1 is unused in this (non-unit-alpha-free) variant. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+ BLASLONG i, j, kk;
+ FLOAT *aa, *cc, *bb;
+
+ /* Start one past the last column of C and the last panel of B. */
+ kk = n - offset;
+ c += n * ldc;
+ b += n * k;
+
+ if (n & 3)
+ {
+ if (n & 1)
+ {
+ /* Peel a single trailing column. */
+ aa = a;
+ c -= ldc;
+ b -= k;
+ bb = b + kk;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
+
+ aa += 8 * k;
+ cc += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
+
+ aa += k;
+ cc += 1;
+ }
+
+ }
+
+ /* One more column is now solved. */
+ kk -= 1;
+ }
+
+ if (n & 2)
+ {
+ /* Peel a pair of trailing columns. */
+ aa = a;
+ c -= 2 * ldc;
+ b -= 2 * k;
+ bb = b + 2 * kk;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
+
+ aa += 8 * k;
+ cc += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+
+ aa += k;
+ cc += 1;
+ }
+ }
+
+ kk -= 2;
+ }
+ }
+
+ /* Main loop: strips of four columns, right to left. */
+ for (j = (n >> 2); j--;)
+ {
+ aa = a;
+ b -= 4 * k;
+ bb = b + 4 * kk;
+ c -= 4 * ldc;
+ cc = c;
+
+ for (i = (m >> 3); i--;)
+ {
+ dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk);
+
+ aa += 8 * k;
+ cc += 8;
+ }
+
+ if (m & 7)
+ {
+ if (m & 4)
+ {
+ dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
+
+ aa += 4 * k;
+ cc += 4;
+ }
+
+ if (m & 2)
+ {
+ dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
+
+ aa += 2 * k;
+ cc += 2;
+ }
+
+ if (m & 1)
+ {
+ dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+
+ aa += k;
+ cc += 1;
+ }
+ }
+
+ kk -= 4;
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/gemv_n.c b/kernel/mips/gemv_n.c
new file mode 100644
index 000000000..4cc177209
--- /dev/null
+++ b/kernel/mips/gemv_n.c
@@ -0,0 +1,56 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT temp;
+
+ ix = 0;
+ a_ptr = a;
+
+ for (j=0; j
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/* Return the 1-based index of the element of x with the largest absolute
+   value (stride inc_x), or 0 for an empty vector / non-positive stride.
+   Strict '>' keeps the first occurrence on ties. */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG idx, pos, best;
+    FLOAT best_val, v;
+
+    if (n <= 0 || inc_x <= 0) return(0);
+
+    best = 0;
+    best_val = ABS(x[0]);
+
+    for (idx = 1, pos = inc_x; idx < n; idx++, pos += inc_x)
+    {
+        v = ABS(x[pos]);
+        if (v > best_val)
+        {
+            best = idx;
+            best_val = v;
+        }
+    }
+
+    /* BLAS index results are 1-based. */
+    return(best + 1);
+}
+
+
diff --git a/kernel/mips/iamin.c b/kernel/mips/iamin.c
new file mode 100644
index 000000000..7f1c4d905
--- /dev/null
+++ b/kernel/mips/iamin.c
@@ -0,0 +1,68 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+/* Return the 1-based index of the element of x with the smallest absolute
+   value (stride inc_x), or 0 for an empty vector / non-positive stride.
+   Cleaned up: minf already holds an absolute value, so wrapping it in ABS
+   again was redundant (and inconsistent with the iamax kernel). */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=ABS(x[0]);
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        /* Strict '<' keeps the first occurrence on ties. */
+        if( ABS(x[ix]) < minf )
+        {
+            min = i;
+            minf = ABS(x[ix]);
+        }
+        ix += inc_x;
+        i++;
+    }
+    /* BLAS index results are 1-based. */
+    return(min+1);
+}
+
+
diff --git a/kernel/mips/imax.c b/kernel/mips/imax.c
new file mode 100644
index 000000000..744bfc0d9
--- /dev/null
+++ b/kernel/mips/imax.c
@@ -0,0 +1,59 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+/* Return the 1-based index of the largest (signed) element of x with
+   stride inc_x, or 0 for an empty vector / non-positive stride.
+   Strict '>' keeps the first occurrence on ties. */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG idx, pos, best;
+    FLOAT best_val;
+
+    if (n <= 0 || inc_x <= 0) return(0);
+
+    best = 0;
+    best_val = x[0];
+
+    for (idx = 1, pos = inc_x; idx < n; idx++, pos += inc_x)
+    {
+        if (x[pos] > best_val)
+        {
+            best = idx;
+            best_val = x[pos];
+        }
+    }
+
+    /* BLAS index results are 1-based. */
+    return(best + 1);
+}
+
+
diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c
new file mode 100644
index 000000000..d9b283d2d
--- /dev/null
+++ b/kernel/mips/imin.c
@@ -0,0 +1,59 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+/* Return the 1-based index of the smallest (signed) element of x with
+   stride inc_x, or 0 for an empty vector / non-positive stride.
+   Fixed: the comparison used '>' which selected the LARGEST element,
+   making this kernel a duplicate of imax instead of imin. */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0;
+    FLOAT minf=0.0;
+    BLASLONG min=0;
+
+    if (n <= 0 || inc_x <= 0) return(min);
+
+    minf=x[0];
+    ix += inc_x;
+    i++;
+
+    while(i < n)
+    {
+        /* '<' selects the minimum; strict comparison keeps the first tie. */
+        if( x[ix] < minf )
+        {
+            min = i;
+            minf = x[ix];
+        }
+        ix += inc_x;
+        i++;
+    }
+    /* BLAS index results are 1-based. */
+    return(min+1);
+}
+
+
diff --git a/kernel/mips/izamax.c b/kernel/mips/izamax.c
new file mode 100644
index 000000000..708ee921d
--- /dev/null
+++ b/kernel/mips/izamax.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+/* Return the 1-based index of the complex element of x with the largest
+   |Re| + |Im| (CABS1), or 0 for an empty vector / non-positive stride. */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG idx, pos, step, best;
+    FLOAT best_val;
+
+    if (n <= 0 || inc_x <= 0) return(0);
+
+    /* Each complex element occupies two FLOAT slots. */
+    step = 2 * inc_x;
+
+    best = 0;
+    best_val = CABS1(x,0);
+
+    /* Strict '>' keeps the first occurrence on ties. */
+    for (idx = 1, pos = step; idx < n; idx++, pos += step)
+    {
+        if( CABS1(x,pos) > best_val )
+        {
+            best = idx;
+            best_val = CABS1(x,pos);
+        }
+    }
+
+    /* BLAS index results are 1-based. */
+    return(best + 1);
+}
+
+
diff --git a/kernel/mips/izamin.c b/kernel/mips/izamin.c
new file mode 100644
index 000000000..523605ef4
--- /dev/null
+++ b/kernel/mips/izamin.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+/* Return the 1-based index of the complex element of x with the smallest
+   |Re| + |Im| (CABS1), or 0 for an empty vector / non-positive stride. */
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG idx, pos, step, best;
+    FLOAT best_val;
+
+    if (n <= 0 || inc_x <= 0) return(0);
+
+    /* Each complex element occupies two FLOAT slots. */
+    step = 2 * inc_x;
+
+    best = 0;
+    best_val = CABS1(x,0);
+
+    /* Strict '<' keeps the first occurrence on ties. */
+    for (idx = 1, pos = step; idx < n; idx++, pos += step)
+    {
+        if( CABS1(x,pos) < best_val )
+        {
+            best = idx;
+            best_val = CABS1(x,pos);
+        }
+    }
+
+    /* BLAS index results are 1-based. */
+    return(best + 1);
+}
+
+
diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h
new file mode 100644
index 000000000..dbc185302
--- /dev/null
+++ b/kernel/mips/macros_msa.h
@@ -0,0 +1,747 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef __MACROS_MSA_H__
+#define __MACROS_MSA_H__
+
+#include <msa.h>
+
+#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
+#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)
+
+#define LD_D(RTYPE, psrc) *((RTYPE *)(psrc))
+#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)
+
+#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
+
+/* Broadcast a scalar float into all 4 lanes of a v4f32 vector
+   (cast fills lane 0 with undefined other lanes; splati replicates lane 0). */
+#define COPY_FLOAT_TO_VECTOR(a) ( { \
+ v4f32 out; \
+ out = __msa_cast_to_vector_float(a); \
+ out = (v4f32) __msa_splati_w((v4i32) out, 0); \
+ out; \
+} )
+
+/* Broadcast a scalar double into both lanes of a v2f64 vector. */
+#define COPY_DOUBLE_TO_VECTOR(a) ( { \
+ v2f64 out; \
+ out = __msa_cast_to_vector_double(a); \
+ out = (v2f64) __msa_splati_d((v2i64) out, 0); \
+ out; \
+} )
+
+/* Description : Load 2 variables with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+*/
+#define LD_GP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = *(psrc); \
+ (psrc) += stride; \
+ out1 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ out2 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_GP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ out4 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ LD_GP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_GP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ LD_GP2_INC(psrc, stride, out4, out5); \
+ out6 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
+
+/* Description : Load 2 vectors of single precision floating point elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - single precision floating point
+*/
+#define LD_SP2(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_SP((psrc)); \
+ out1 = LD_SP((psrc) + stride); \
+}
+
+#define LD_SP4(psrc, stride, out0, out1, out2, out3) \
+{ \
+ LD_SP2(psrc, stride, out0, out1) \
+ LD_SP2(psrc + 2 * stride, stride, out2, out3) \
+}
+
+#define LD_SP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_SP((psrc)); \
+ (psrc) += stride; \
+ out1 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ out2 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_SP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ out4 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ LD_SP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_SP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ LD_SP2_INC(psrc, stride, out4, out5); \
+ out6 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
+
+#define LD_SP16_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7, out8, \
+ out9, out10, out11, out12, out13, \
+ out14, out15) \
+{ \
+ LD_SP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7); \
+ LD_SP8_INC(psrc, stride, out8, out9, out10, \
+ out11, out12, out13, out14, out15); \
+}
+
+/* Description : Load 2 vectors of double precision floating point elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - double precision floating point
+*/
+#define LD_DP2(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_DP((psrc)); \
+ out1 = LD_DP((psrc) + stride); \
+}
+
+#define LD_DP4(psrc, stride, out0, out1, out2, out3) \
+{ \
+ LD_DP2(psrc, stride, out0, out1) \
+ LD_DP2(psrc + 2 * stride, stride, out2, out3) \
+}
+
+#define LD_DP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_DP(psrc); \
+ (psrc) += stride; \
+ out1 = LD_DP(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_DP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ out2 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_DP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ out4 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ LD_DP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_DP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ LD_DP2_INC(psrc, stride, out4, out5); \
+ out6 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
+
+#define LD_DP16_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7, out8, \
+ out9, out10, out11, out12, out13, \
+ out14, out15) \
+{ \
+ LD_DP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7); \
+ LD_DP8_INC(psrc, stride, out8, out9, out10, \
+ out11, out12, out13, out14, out15); \
+}
+
+/* Description : Store GP variable with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 single precision floating point elements from 'in0' to (pdst)
+ Store 4 single precision floating point elements from 'in1' to (pdst + stride)
+*/
+#define ST_GP2_INC(in0, in1, \
+ pdst, stride) \
+{ \
+ *(pdst) = in0; \
+ (pdst) += stride; \
+ *(pdst) = in1; \
+ (pdst) += stride; \
+}
+
+#define ST_GP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ *(pdst) = in2; \
+ (pdst) += stride; \
+}
+
+#define ST_GP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_GP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ *(pdst) = in4; \
+ (pdst) += stride; \
+}
+
+#define ST_GP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ ST_GP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_GP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ ST_GP2_INC(in4, in5, pdst, stride); \
+ *(pdst) = in6; \
+ (pdst) += stride; \
+}
+
+#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
+/* Description : Store vectors of single precision floating point elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 single precision floating point elements from 'in0' to (pdst)
+ Store 4 single precision floating point elements from 'in1' to (pdst + stride)
+*/
+#define ST_SP2(in0, in1, pdst, stride) \
+{ \
+ ST_SP(in0, (pdst)); \
+ ST_SP(in1, (pdst) + stride); \
+}
+
+#define ST_SP4(in0, in1, in2, in3, pdst, stride) \
+{ \
+ ST_SP2(in0, in1, (pdst), stride); \
+ ST_SP2(in2, in3, (pdst + 2 * stride), stride); \
+}
+
+#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+{ \
+ ST_SP4(in0, in1, in2, in3, (pdst), stride); \
+ ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \
+}
+
+#define ST_SP2_INC(in0, in1, pdst, stride) \
+{ \
+ ST_SP(in0, (pdst)); \
+ (pdst) += stride; \
+ ST_SP(in1, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP(in2, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_SP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP(in4, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_SP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP2_INC(in4, in5, pdst, stride); \
+ ST_SP(in6, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
+#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, in8, in9, in10, in11, in12, \
+ in13, in14, in15, pdst, stride) \
+{ \
+ ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, pdst, stride); \
+ ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \
+ in15, pdst, stride); \
+}
+
+/* Description : Store vectors of double precision floating point elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 2 double precision floating point elements from 'in0' to (pdst)
+ Store 2 double precision floating point elements from 'in1' to (pdst + stride)
+*/
+#define ST_DP2(in0, in1, pdst, stride) \
+{ \
+ ST_DP(in0, (pdst)); \
+ ST_DP(in1, (pdst) + stride); \
+}
+
+#define ST_DP4(in0, in1, in2, in3, pdst, stride) \
+{ \
+ ST_DP2(in0, in1, (pdst), stride); \
+ ST_DP2(in2, in3, (pdst) + 2 * stride, stride); \
+}
+
+#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+{ \
+ ST_DP4(in0, in1, in2, in3, (pdst), stride); \
+ ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+}
+
+#define ST_DP2_INC(in0, in1, pdst, stride) \
+{ \
+ ST_DP(in0, (pdst)); \
+ (pdst) += stride; \
+ ST_DP(in1, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP(in2, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_DP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP(in4, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_DP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP2_INC(in4, in5, pdst, stride); \
+ ST_DP(in6, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
+#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, in8, in9, in10, in11, in12, \
+ in13, in14, in15, pdst, stride) \
+{ \
+ ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, pdst, stride); \
+ ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \
+ in15, pdst, stride); \
+}
+
+/* Description : shuffle elements in vector as shf_val
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+*/
+#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \
+{ \
+ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
+ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
+}
+#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
+#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)
+
+#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \
+ shf_val) \
+{ \
+ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
+ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
+ out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \
+}
+#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)
+
+#define SHF_W4(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3, shf_val) \
+{ \
+ SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \
+ SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \
+}
+#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
+#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved and written to 'out0'
+*/
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
+ out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
+}
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)
+
+#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
+}
+#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
+#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
+
+/* Description : Indexed word element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, stidx
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'stidx' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ 'stidx + 1' element value from 'in' vector is replicated to all
+ elements in 'out1' vector
+ Valid index range for word operation is 0-3
+*/
+#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
+ out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
+}
+#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)
+
+#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
+{ \
+ SPLATI_W2(RTYPE, in, 0, out0, out1); \
+ SPLATI_W2(RTYPE, in, 2, out2, out3); \
+}
+#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
+
+#define SPLATI_D2(RTYPE, in, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \
+ out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \
+}
+#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' are copied to the left half
+ of 'out0' & even double word elements of 'in1' are copied to
+ the right half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
+}
+#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
+#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)
+
+#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
+ out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \
+}
+#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
+
+/* Description : pack both even and odd half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are copied to the
+ 'out0' & odd double word elements of 'in0' and 'in1' are
+ copied to the 'out1'.
+*/
+#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \
+ out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \
+}
+#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)
+
+#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
+}
+#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+}
+#define MUL3(in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ out2 = in4 * in5; \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Addition of 2 pairs of variables
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+}
+#define ADD3(in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ out2 = in4 + in5; \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3) \
+{ \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \
+ ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \
+}
+#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
+
+#endif /* __MACROS_MSA_H__ */
diff --git a/kernel/mips/max.c b/kernel/mips/max.c
new file mode 100644
index 000000000..2ad956bc0
--- /dev/null
+++ b/kernel/mips/max.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT maxf=0.0;
+
+ if (n <= 0 || inc_x <= 0) return(maxf);
+
+ maxf=x[0];
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( x[ix] > maxf )
+ {
+ maxf = x[ix];
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(maxf);
+}
+
+
diff --git a/kernel/mips/min.c b/kernel/mips/min.c
new file mode 100644
index 000000000..2812fe397
--- /dev/null
+++ b/kernel/mips/min.c
@@ -0,0 +1,65 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT minf=0.0;
+
+ if (n <= 0 || inc_x <= 0) return(minf);
+
+ minf=x[0];
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( x[ix] < minf )
+ {
+ minf = x[ix];
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(minf);
+}
+
+
diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c
new file mode 100644
index 000000000..fcff09337
--- /dev/null
+++ b/kernel/mips/nrm2.c
@@ -0,0 +1,88 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/13 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT scale = 0.0;
+ FLOAT ssq = 1.0;
+ FLOAT absxi = 0.0;
+
+
+ if (n <= 0 || inc_x <= 0) return(0.0);
+ if ( n == 1 ) return( ABS(x[0]) );
+
+ n *= inc_x;
+ while(i < n)
+ {
+
+ if ( x[i] != 0.0 )
+ {
+ absxi = ABS( x[i] );
+ if ( scale < absxi )
+ {
+ ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+ scale = absxi ;
+ }
+ else
+ {
+ ssq += ( absxi/scale ) * ( absxi/scale );
+ }
+
+ }
+ i += inc_x;
+ }
+ scale = scale * sqrt( ssq );
+ return(scale);
+
+}
+
+
diff --git a/kernel/mips/omatcopy_cn.c b/kernel/mips/omatcopy_cn.c
new file mode 100644
index 000000000..11357ec93
--- /dev/null
+++ b/kernel/mips/omatcopy_cn.c
@@ -0,0 +1,82 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+ BLASLONG i,j;
+ FLOAT *aptr,*bptr;
+
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
+
+ aptr = a;
+ bptr = b;
+
+ if ( alpha == 0.0 )
+ {
+ for ( i=0; i
+#include "macros_msa.h"
+
+#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i = 0;
+ FLOAT data0, data1, data2, sumf = 0.0;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+ v4f32 zero_v = {0};
+ v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+
+ if (n <= 0 || inc_x <= 0) return (sumf);
+
+ if (1 == inc_x)
+ {
+ if (n > 31)
+ {
+ n -= 32;
+
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 = AND_VEC_W(src0);
+ sum_abs1 = AND_VEC_W(src1);
+ sum_abs2 = AND_VEC_W(src2);
+ sum_abs3 = AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ sum_abs2 = zero_v;
+ sum_abs3 = zero_v;
+ }
+
+ for (i = 0; i < (n >> 5); i++)
+ {
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
+ }
+
+ if (n & 31)
+ {
+ if ((n & 16) && (n & 8) && (n & 4))
+ {
+ LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+ sum_abs2 += AND_VEC_W(src6);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 16) && (n & 8))
+ {
+ LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src5);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 16) && (n & 4))
+ {
+ LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+ sum_abs0 += AND_VEC_W(src4);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_SP3_INC(x, 4, src0, src1, src2);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 16)
+ {
+ LD_SP4_INC(x, 4, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 8)
+ {
+ LD_SP2_INC(x, 4, src0, src1);
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else if (n & 4)
+ {
+ src0 = LD_SP(x); x += 4;
+
+ sum_abs0 += AND_VEC_W(src0);
+
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ else
+ {
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+
+ if (n & 2)
+ {
+ sumf += fabsf(*(x + 0));
+ sumf += fabsf(*(x + 1));
+ x += 2;
+ }
+
+ if (n & 1)
+ {
+ sumf += fabsf(*(x + 0));
+ }
+ }
+ else
+ {
+ sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+ }
+ }
+ else
+ {
+ if (n > 8)
+ {
+ n -= 8;
+
+ src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+ x += inc_x;
+
+ sum_abs0 = AND_VEC_W(src0);
+ sum_abs1 = AND_VEC_W(src4);
+ }
+ else
+ {
+ sum_abs0 = zero_v;
+ sum_abs1 = zero_v;
+ }
+
+ for (i = (n >> 3); i--;)
+ {
+ src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+ x += inc_x;
+ src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+ x += inc_x;
+
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src4);
+ }
+
+ if (n & 4)
+ {
+ src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ x += inc_x;
+
+ sum_abs0 += AND_VEC_W(src0);
+ }
+
+ sum_abs0 += sum_abs1;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
+
+ if ((n & 2) && (n & 1))
+ {
+ data0 = fabsf(*x); x += inc_x;
+ data1 = fabsf(*x); x += inc_x;
+ data2 = fabsf(*x);
+
+ sumf += data0;
+ sumf += data1;
+ sumf += data2;
+ }
+ else if (n & 2)
+ {
+ data0 = fabsf(*x); x += inc_x;
+ data1 = fabsf(*x);
+
+ sumf += data0;
+ sumf += data1;
+ }
+ else if (n & 1)
+ {
+ data0 = fabsf(*x);
+
+ sumf += data0;
+ }
+ }
+
+ return (sumf);
+}
diff --git a/kernel/mips/scal.c b/kernel/mips/scal.c
new file mode 100644
index 000000000..01f708b1d
--- /dev/null
+++ b/kernel/mips/scal.c
@@ -0,0 +1,50 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0,j=0;
+
+ while(j < n)
+ {
+
+ if ( da == 0.0 )
+ x[i]=0.0;
+ else
+ x[i] = da * x[i] ;
+
+ i += inc_x ;
+ j++;
+
+ }
+ return 0;
+
+}
+
+
diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c
new file mode 100644
index 000000000..1997ec5a0
--- /dev/null
+++ b/kernel/mips/sdot_msa.c
@@ -0,0 +1,208 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* return float, x,y float */
+/* SDOT/DSDOT kernel for MIPS MSA: returns the dot product of the n-element
+   vectors x (stride inc_x) and y (stride inc_y). Under DSDOT the result type
+   is double, otherwise FLOAT; the scalar accumulator `dot` is double either way.
+   NOTE(review): the vector accumulator dot0 is v4f32 (single precision) even
+   when DSDOT is defined, so the unit-stride fast path does not accumulate in
+   double as reference DSDOT does -- confirm whether that is intended here. */
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i = 0;
+ double dot = 0.0;
+ float x0, x1, x2, x3, y0, y1, y2, y3;
+ v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+ v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+ v4f32 dot0 = {0, 0, 0, 0};
+
+ /* Negative n: nothing to do. (n == 0 would also fall through harmlessly,
+    since every loop below runs zero times.) */
+ if (n < 0) return (dot);
+
+ /* Fast path: both vectors contiguous -> MSA-vectorized accumulation. */
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Main loop: 32 elements per iteration (8 MSA vectors of 4 floats each). */
+ for (i = (n >> 5); i--;)
+ {
+ LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ dot0 += (vy6 * vx6);
+ dot0 += (vy7 * vx7);
+ }
+
+ /* Remainder (n % 32): the 4-element vector chunks are selected by bits
+    16/8/4 of n; each bit combination loads exactly (n & 28) / 4 vectors. */
+ if (n & 31)
+ {
+ if ((n & 16) && (n & 8) && (n & 4))
+ {
+ LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
+ LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ dot0 += (vy6 * vx6);
+ }
+ else if ((n & 16) && (n & 8))
+ {
+ LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
+ LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ dot0 += (vy5 * vx5);
+ }
+ else if ((n & 16) && (n & 4))
+ {
+ LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
+ LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ dot0 += (vy4 * vx4);
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_SP3_INC(x, 4, vx0, vx1, vx2);
+ LD_SP3_INC(y, 4, vy0, vy1, vy2);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ }
+ else if (n & 16)
+ {
+ LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+ LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ dot0 += (vy2 * vx2);
+ dot0 += (vy3 * vx3);
+ }
+ else if (n & 8)
+ {
+ LD_SP2_INC(x, 4, vx0, vx1);
+ LD_SP2_INC(y, 4, vy0, vy1);
+
+ dot0 += (vy0 * vx0);
+ dot0 += (vy1 * vx1);
+ }
+ else if (n & 4)
+ {
+ vx0 = LD_SP(x); x += 4;
+ vy0 = LD_SP(y); y += 4;
+
+ dot0 += (vy0 * vx0);
+ }
+
+ /* Last 0-3 elements (bits 2 and 1 of n) handled as scalars, accumulated
+    directly into the double-precision scalar accumulator. */
+ if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(x, 1, x0, x1, x2);
+ LD_GP3_INC(y, 1, y0, y1, y2);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ dot += (y2 * x2);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(x, 1, x0, x1);
+ LD_GP2_INC(y, 1, y0, y1);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ }
+ else if (n & 1)
+ {
+ x0 = *x;
+ y0 = *y;
+
+ dot += (y0 * x0);
+ }
+ }
+
+ /* Horizontal reduction: fold the four lanes of the vector accumulator
+    into the scalar result. */
+ dot += dot0[0];
+ dot += dot0[1];
+ dot += dot0[2];
+ dot += dot0[3];
+ }
+ else
+ {
+ /* General strided path: scalar loop unrolled by 4, then a 0-3 element tail. */
+ for (i = (n >> 2); i--;)
+ {
+ LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+ LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ dot += (y2 * x2);
+ dot += (y3 * x3);
+ }
+
+ if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(x, inc_x, x0, x1, x2);
+ LD_GP3_INC(y, inc_y, y0, y1, y2);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ dot += (y2 * x2);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(x, inc_x, x0, x1);
+ LD_GP2_INC(y, inc_y, y0, y1);
+
+ dot += (y0 * x0);
+ dot += (y1 * x1);
+ }
+ else if (n & 1)
+ {
+ x0 = *x;
+ y0 = *y;
+
+ dot += (y0 * x0);
+ }
+ }
+
+ return (dot);
+}
diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c
new file mode 100644
index 000000000..1695471ad
--- /dev/null
+++ b/kernel/mips/sgemm_kernel_8x8_msa.c
@@ -0,0 +1,2482 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
+ FLOAT *C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+ FLOAT *pa0, *pb0;
+ FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
+ v4f32 v_alpha = {alpha, alpha, alpha, alpha};
+ v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
+ v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
+ v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 3); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + ldc;
+ pc2 = pc1 + ldc;
+ pc3 = pc2 + ldc;
+ pc4 = pc3 + ldc;
+ pc5 = pc4 + ldc;
+ pc6 = pc5 + ldc;
+ pc7 = pc6 + ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ pa0 = A;
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 = src_a0 * src_b;
+ res3 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 = src_a0 * src_b;
+ res5 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 = src_a0 * src_b;
+ res7 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res8 = src_a0 * src_b;
+ res9 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res10 = src_a0 * src_b;
+ res11 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res12 = src_a0 * src_b;
+ res13 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res14 = src_a0 * src_b;
+ res15 = src_a1 * src_b;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res10 += src_a0 * src_b;
+ res11 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res14 += src_a0 * src_b;
+ res15 += src_a1 * src_b;
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res10 += src_a0 * src_b;
+ res11 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res14 += src_a0 * src_b;
+ res15 += src_a1 * src_b;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res8 += src_a0 * src_b;
+ res9 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res10 += src_a0 * src_b;
+ res11 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res12 += src_a0 * src_b;
+ res13 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res14 += src_a0 * src_b;
+ res15 += src_a1 * src_b;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
+ LD_SP2(pc0, 4, dst0, dst1);
+ LD_SP2(pc1, 4, dst2, dst3);
+ LD_SP2(pc2, 4, dst4, dst5);
+ LD_SP2(pc3, 4, dst6, dst7);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+ dst4 += res4 * v_alpha;
+ dst5 += res5 * v_alpha;
+ dst6 += res6 * v_alpha;
+ dst7 += res7 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+ ST_SP2_INC(dst4, dst5, pc2, 4);
+ ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+ dst0 = res8 * v_alpha;
+ dst1 = res9 * v_alpha;
+ dst2 = res10 * v_alpha;
+ dst3 = res11 * v_alpha;
+ dst4 = res12 * v_alpha;
+ dst5 = res13 * v_alpha;
+ dst6 = res14 * v_alpha;
+ dst7 = res15 * v_alpha;
+#else
+ LD_SP2(pc4, 4, dst0, dst1);
+ LD_SP2(pc5, 4, dst2, dst3);
+ LD_SP2(pc6, 4, dst4, dst5);
+ LD_SP2(pc7, 4, dst6, dst7);
+
+ dst0 += res8 * v_alpha;
+ dst1 += res9 * v_alpha;
+ dst2 += res10 * v_alpha;
+ dst3 += res11 * v_alpha;
+ dst4 += res12 * v_alpha;
+ dst5 += res13 * v_alpha;
+ dst6 += res14 * v_alpha;
+ dst7 += res15 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc4, 4);
+ ST_SP2_INC(dst2, dst3, pc5, 4);
+ ST_SP2_INC(dst4, dst5, pc6, 4);
+ ST_SP2_INC(dst6, dst7, pc7, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_SP(pa0);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res4 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res5 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res6 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res7 = src_a0 * src_b;
+
+ pa0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_SP(pa0);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res4 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res5 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res6 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res7 += src_a0 * src_b;
+
+ pa0 += 4;
+
+ src_a0 = LD_SP(pa0);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res4 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res5 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res6 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res7 += src_a0 * src_b;
+
+ pa0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_SP(pa0);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
+ res4 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
+ res5 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
+ res6 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
+ res7 += src_a0 * src_b;
+
+ pa0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ dst0 = LD_SP(pc0);
+ dst1 = LD_SP(pc1);
+ dst2 = LD_SP(pc2);
+ dst3 = LD_SP(pc3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_SP(dst0, pc0);
+ ST_SP(dst1, pc1);
+ ST_SP(dst2, pc2);
+ ST_SP(dst3, pc3);
+
+#if defined(TRMMKERNEL)
+ dst0 = res4 * v_alpha;
+ dst1 = res5 * v_alpha;
+ dst2 = res6 * v_alpha;
+ dst3 = res7 * v_alpha;
+#else
+ dst0 = LD_SP(pc4);
+ dst1 = LD_SP(pc5);
+ dst2 = LD_SP(pc6);
+ dst3 = LD_SP(pc7);
+
+ dst0 += res4 * v_alpha;
+ dst1 += res5 * v_alpha;
+ dst2 += res6 * v_alpha;
+ dst3 += res7 * v_alpha;
+#endif
+ ST_SP(dst0, pc4);
+ ST_SP(dst1, pc5);
+ ST_SP(dst2, pc6);
+ ST_SP(dst3, pc7);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ pc1 += 4;
+ pc2 += 4;
+ pc3 += 4;
+ pc4 += 4;
+ pc5 += 4;
+ pc6 += 4;
+ pc7 += 4;
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 = a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 = a0 * b1;
+ tmp3 = a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 = a0 * b2;
+ tmp5 = a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 = a0 * b3;
+ tmp7 = a1 * b3;
+
+ b4 = pb0[4];
+ tmp8 = a0 * b4;
+ tmp9 = a1 * b4;
+
+ b5 = pb0[5];
+ tmp10 = a0 * b5;
+ tmp11 = a1 * b5;
+
+ b6 = pb0[6];
+ tmp12 = a0 * b6;
+ tmp13 = a1 * b6;
+
+ b7 = pb0[7];
+ tmp14 = a0 * b7;
+ tmp15 = a1 * b7;
+
+ pa0 += 2;
+ pb0 += 8;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ b4 = pb0[4];
+ tmp8 += a0 * b4;
+ tmp9 += a1 * b4;
+
+ b5 = pb0[5];
+ tmp10 += a0 * b5;
+ tmp11 += a1 * b5;
+
+ b6 = pb0[6];
+ tmp12 += a0 * b6;
+ tmp13 += a1 * b6;
+
+ b7 = pb0[7];
+ tmp14 += a0 * b7;
+ tmp15 += a1 * b7;
+
+ pa0 += 2;
+ pb0 += 8;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ b4 = pb0[4];
+ tmp8 += a0 * b4;
+ tmp9 += a1 * b4;
+
+ b5 = pb0[5];
+ tmp10 += a0 * b5;
+ tmp11 += a1 * b5;
+
+ b6 = pb0[6];
+ tmp12 += a0 * b6;
+ tmp13 += a1 * b6;
+
+ b7 = pb0[7];
+ tmp14 += a0 * b7;
+ tmp15 += a1 * b7;
+
+ pa0 += 2;
+ pb0 += 8;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ b4 = pb0[4];
+ tmp8 += a0 * b4;
+ tmp9 += a1 * b4;
+
+ b5 = pb0[5];
+ tmp10 += a0 * b5;
+ tmp11 += a1 * b5;
+
+ b6 = pb0[6];
+ tmp12 += a0 * b6;
+ tmp13 += a1 * b6;
+
+ b7 = pb0[7];
+ tmp14 += a0 * b7;
+ tmp15 += a1 * b7;
+
+ pa0 += 2;
+ pb0 += 8;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp2 = alpha * tmp2;
+ tmp4 = alpha * tmp4;
+ tmp6 = alpha * tmp6;
+ tmp8 = alpha * tmp8;
+ tmp10 = alpha * tmp10;
+ tmp12 = alpha * tmp12;
+ tmp14 = alpha * tmp14;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc2[0] = tmp4;
+ pc3[0] = tmp6;
+ pc4[0] = tmp8;
+ pc5[0] = tmp10;
+ pc6[0] = tmp12;
+ pc7[0] = tmp14;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp2;
+ pc2[0] += tmp4;
+ pc3[0] += tmp6;
+ pc4[0] += tmp8;
+ pc5[0] += tmp10;
+ pc6[0] += tmp12;
+ pc7[0] += tmp14;
+#endif
+ tmp1 = alpha * tmp1;
+ tmp3 = alpha * tmp3;
+ tmp5 = alpha * tmp5;
+ tmp7 = alpha * tmp7;
+ tmp9 = alpha * tmp9;
+ tmp11 = alpha * tmp11;
+ tmp13 = alpha * tmp13;
+ tmp15 = alpha * tmp15;
+
+#if defined(TRMMKERNEL)
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+ pc2[1] = tmp5;
+ pc3[1] = tmp7;
+ pc4[1] = tmp9;
+ pc5[1] = tmp11;
+ pc6[1] = tmp13;
+ pc7[1] = tmp15;
+#else
+ pc0[1] += tmp1;
+ pc1[1] += tmp3;
+ pc2[1] += tmp5;
+ pc3[1] += tmp7;
+ pc4[1] += tmp9;
+ pc5[1] += tmp11;
+ pc6[1] += tmp13;
+ pc7[1] += tmp15;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ pc4 += 2;
+ pc5 += 2;
+ pc6 += 2;
+ pc7 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 = a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 = a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 = a0 * b3;
+
+ b4 = pb0[4];
+ tmp4 = a0 * b4;
+
+ b5 = pb0[5];
+ tmp5 = a0 * b5;
+
+ b6 = pb0[6];
+ tmp6 = a0 * b6;
+
+ b7 = pb0[7];
+ tmp7 = a0 * b7;
+
+ pa0 += 1;
+ pb0 += 8;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ b4 = pb0[4];
+ tmp4 += a0 * b4;
+
+ b5 = pb0[5];
+ tmp5 += a0 * b5;
+
+ b6 = pb0[6];
+ tmp6 += a0 * b6;
+
+ b7 = pb0[7];
+ tmp7 += a0 * b7;
+
+ pa0 += 1;
+ pb0 += 8;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ b4 = pb0[4];
+ tmp4 += a0 * b4;
+
+ b5 = pb0[5];
+ tmp5 += a0 * b5;
+
+ b6 = pb0[6];
+ tmp6 += a0 * b6;
+
+ b7 = pb0[7];
+ tmp7 += a0 * b7;
+
+ pa0 += 1;
+ pb0 += 8;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ b4 = pb0[4];
+ tmp4 += a0 * b4;
+
+ b5 = pb0[5];
+ tmp5 += a0 * b5;
+
+ b6 = pb0[6];
+ tmp6 += a0 * b6;
+
+ b7 = pb0[7];
+ tmp7 += a0 * b7;
+
+ pa0 += 1;
+ pb0 += 8;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+ tmp2 = alpha * tmp2;
+ tmp3 = alpha * tmp3;
+ tmp4 = alpha * tmp4;
+ tmp5 = alpha * tmp5;
+ tmp6 = alpha * tmp6;
+ tmp7 = alpha * tmp7;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+ pc4[0] = tmp4;
+ pc5[0] = tmp5;
+ pc6[0] = tmp6;
+ pc7[0] = tmp7;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp1;
+ pc2[0] += tmp2;
+ pc3[0] += tmp3;
+ pc4[0] += tmp4;
+ pc5[0] += tmp5;
+ pc6[0] += tmp6;
+ pc7[0] += tmp7;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+ pc0 += 1;
+ pc1 += 1;
+ pc2 += 1;
+ pc3 += 1;
+ pc4 += 1;
+ pc5 += 1;
+ pc6 += 1;
+ pc7 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 8; // number of values in A
+#endif
+
+ l = (k << 3);
+ B = B + l;
+ i = (ldc << 3);
+ C = C + i;
+ }
+
+ if (n & 4)
+ {
+ pc0 = C;
+ pc1 = pc0 + ldc;
+ pc2 = pc1 + ldc;
+ pc3 = pc2 + ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 = src_a0 * src_b;
+ res3 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 = src_a0 * src_b;
+ res5 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 = src_a0 * src_b;
+ res7 = src_a1 * src_b;
+
+ pb0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ pb0 += 4;
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ pb0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res4 += src_a0 * src_b;
+ res5 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res6 += src_a0 * src_b;
+ res7 += src_a1 * src_b;
+
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
+ LD_SP2(pc0, 4, dst0, dst1);
+ LD_SP2(pc1, 4, dst2, dst3);
+ LD_SP2(pc2, 4, dst4, dst5);
+ LD_SP2(pc3, 4, dst6, dst7);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+ dst4 += res4 * v_alpha;
+ dst5 += res5 * v_alpha;
+ dst6 += res6 * v_alpha;
+ dst7 += res7 * v_alpha;
+#endif
+
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+ ST_SP2_INC(dst4, dst5, pc2, 4);
+ ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_SP(pa0);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 = src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 4;
+
+ src_a0 = LD_SP(pa0);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0 = LD_SP(pb0);
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
+ res2 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
+ res3 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 4;
+ }
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ dst0 = LD_SP(pc0);
+ dst1 = LD_SP(pc1);
+ dst2 = LD_SP(pc2);
+ dst3 = LD_SP(pc3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_SP(dst0, pc0);
+ ST_SP(dst1, pc1);
+ ST_SP(dst2, pc2);
+ ST_SP(dst3, pc3);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ pc0 += 4;
+ pc1 += 4;
+ pc2 += 4;
+ pc3 += 4;
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 = a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 = a0 * b1;
+ tmp3 = a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 = a0 * b2;
+ tmp5 = a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 = a0 * b3;
+ tmp7 = a1 * b3;
+
+ pa0 += 2;
+ pb0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ pa0 += 2;
+ pb0 += 4;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ pa0 += 2;
+ pb0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ b2 = pb0[2];
+ tmp4 += a0 * b2;
+ tmp5 += a1 * b2;
+
+ b3 = pb0[3];
+ tmp6 += a0 * b3;
+ tmp7 += a1 * b3;
+
+ pa0 += 2;
+ pb0 += 4;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp2 = alpha * tmp2;
+ tmp4 = alpha * tmp4;
+ tmp6 = alpha * tmp6;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc2[0] = tmp4;
+ pc3[0] = tmp6;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp2;
+ pc2[0] += tmp4;
+ pc3[0] += tmp6;
+#endif
+ tmp1 = alpha * tmp1;
+ tmp3 = alpha * tmp3;
+ tmp5 = alpha * tmp5;
+ tmp7 = alpha * tmp7;
+
+#if defined(TRMMKERNEL)
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+ pc2[1] = tmp5;
+ pc3[1] = tmp7;
+#else
+ pc0[1] += tmp1;
+ pc1[1] += tmp3;
+ pc2[1] += tmp5;
+ pc3[1] += tmp7;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 = a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 = a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 = a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ b2 = pb0[2];
+ tmp2 += a0 * b2;
+
+ b3 = pb0[3];
+ tmp3 += a0 * b3;
+
+ pa0 += 1;
+ pb0 += 4;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+ tmp2 = alpha * tmp2;
+ tmp3 = alpha * tmp3;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp1;
+ pc2[0] += tmp2;
+ pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+ pc0 += 1;
+ pc1 += 1;
+ pc2 += 1;
+ pc3 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
+ l = (k << 2);
+ B = B + l;
+ i = (ldc << 2);
+ C = C + i;
+ }
+
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 = src_a0 * src_b;
+ res3 = src_a1 * src_b;
+
+ pb0 += 2;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ pb0 += 2;
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ pb0 += 2;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res2 += src_a0 * src_b;
+ res3 += src_a1 * src_b;
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
+ LD_SP2(pc0, 4, dst0, dst1);
+ LD_SP2(pc1, 4, dst2, dst3);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+ dst2 += res2 * v_alpha;
+ dst3 += res3 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 = src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 2;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 2;
+
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 2;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+ src_b0[1] = pb0[1];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
+ res1 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
+ dst0 = LD_SP(pc0);
+ dst1 = LD_SP(pc1);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+#endif
+ ST_SP(dst0, pc0);
+ ST_SP(dst1, pc1);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ pc0 += 4;
+ pc1 += 4;
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 = a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 = a0 * b1;
+ tmp3 = a1 * b1;
+
+ pa0 += 2;
+ pb0 += 2;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ pa0 += 2;
+ pb0 += 2;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ b1 = pb0[1];
+ tmp2 += a0 * b1;
+ tmp3 += a1 * b1;
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+ tmp2 = alpha * tmp2;
+ tmp3 = alpha * tmp3;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp2;
+ pc0[1] += tmp1;
+ pc1[1] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 = a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ b1 = pb0[1];
+ tmp1 += a0 * b1;
+
+ pa0 += 1;
+ pb0 += 2;
+ }
+
+ tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+#else
+ pc0[0] += tmp0;
+ pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+ pc0 += 1;
+ pc1 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
+ l = (k << 1);
+ B = B + l;
+ i = (ldc << 1);
+ C = C + i;
+ }
+
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+ res1 = src_a1 * src_b;
+
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+ res1 += src_a1 * src_b;
+
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
+ LD_SP2(pc0, 4, dst0, dst1);
+
+ dst0 += res0 * v_alpha;
+ dst1 += res1 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 = src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 1;
+
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ src_a0 = LD_SP(pa0);
+ src_b0[0] = pb0[0];
+
+ src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
+ res0 += src_a0 * src_b;
+
+ pa0 += 4;
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+#else
+ dst0 = LD_SP(pc0);
+
+ dst0 += res0 * v_alpha;
+#endif
+ ST_SP(dst0, pc0);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ pc0 += 4;
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 = a1 * b0;
+
+ pa0 += 2;
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ pa0 += 2;
+ pb0 += 1;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ pa0 += 2;
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ a1 = pa0[1];
+ tmp1 += a1 * b0;
+
+ pa0 += 2;
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc0[1] = tmp1;
+#else
+ pc0[0] += tmp0;
+ pc0[1] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 = a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+
+ for (l = ((temp - 1) >> 1); l--;)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+ }
+
+ if ((temp - 1) & 1)
+ {
+ a0 = pa0[0];
+ b0 = pb0[0];
+ tmp0 += a0 * b0;
+
+ pa0 += 1;
+ pb0 += 1;
+ }
+
+#if defined(TRMMKERNEL)
+ pc0[0] = alpha * tmp0;
+#else
+ pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 1;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in A
+#endif
+ l = (k << 0);
+ B = B + l;
+ i = (ldc << 0);
+ C = C + i;
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/sgemm_ncopy_8_msa.c b/kernel/mips/sgemm_ncopy_8_msa.c
new file mode 100644
index 000000000..8618c4435
--- /dev/null
+++ b/kernel/mips/sgemm_ncopy_8_msa.c
@@ -0,0 +1,164 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)  // pack a panel of src (row stride lda) into contiguous dst, interleaving groups of 8/4/2/1 lda-strided rows
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4f32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;  // dst8..dst15 are only used by the 8-row main loop
+
+    psrc0 = src;
+    pdst = dst;
+
+    for (j = (n >> 3); j--;)  // full blocks of 8 lda-strided rows
+    {
+        psrc1 = psrc0;        // eight row pointers, each lda apart
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;     // advance to the next block of 8 rows
+
+        for (i = (m >> 3); i--;)  // 8 elements per row per iteration
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);  // load two 4-float vectors from each row; macros advance the pointer
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
+
+            // 4x4 transposes (macro from macros_msa.h) interleave the rows so that
+            // each stored vector holds one element from 4 consecutive rows
+            TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6);
+            TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5,
+                               dst7);
+            TRANSPOSE4x4_SP_SP(src1, src3, src5, src7, dst8, dst10, dst12,
+                               dst14);
+            TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13,
+                               dst15);
+
+            ST_SP2_INC(dst0, dst1, pdst, 4);  // contiguous stores; pdst advances by 8 floats each line
+            ST_SP2_INC(dst2, dst3, pdst, 4);
+            ST_SP2_INC(dst4, dst5, pdst, 4);
+            ST_SP2_INC(dst6, dst7, pdst, 4);
+            ST_SP2_INC(dst8, dst9, pdst, 4);
+            ST_SP2_INC(dst10, dst11, pdst, 4);
+            ST_SP2_INC(dst12, dst13, pdst, 4);
+            ST_SP2_INC(dst14, dst15, pdst, 4);
+        }
+
+        for (i = (m & 7); i--;)  // scalar tail: interleave one element from each of the 8 rows
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+            *pdst++ = *psrc5++;
+            *pdst++ = *psrc6++;
+            *pdst++ = *psrc7++;
+            *pdst++ = *psrc8++;
+        }
+    }
+
+    if (n & 4)  // remaining block of 4 rows
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        for (i = (m >> 2); i--;)  // 4 elements per row per iteration
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc3);
+            src3 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3);  // interleave the 4 rows
+
+            ST_SP2_INC(dst0, dst1, pdst, 4);
+            ST_SP2_INC(dst2, dst3, pdst, 4);
+        }
+
+        for (i = (m & 3); i--;)  // scalar tail for the 4-row block
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc3++;
+            *pdst++ = *psrc4++;
+        }
+    }
+
+    if (n & 2)  // remaining block of 2 rows (scalar, 2 elements per row per iteration)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 1); i--;)
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+
+        if (m & 1)
+        {
+            *pdst++ = *psrc1++;
+            *pdst++ = *psrc2++;
+        }
+    }
+
+    if (n & 1)  // final single row: straight copy
+    {
+        psrc1 = psrc0;
+
+        for (i = m; i--;)
+        {
+            *pdst++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/sgemm_tcopy_8_msa.c b/kernel/mips/sgemm_tcopy_8_msa.c
new file mode 100644
index 000000000..3542eca21
--- /dev/null
+++ b/kernel/mips/sgemm_tcopy_8_msa.c
@@ -0,0 +1,271 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)  // pack an m x n panel of src (row stride lda) into dst, rows copied 8/4/2/1 at a time with column tails segregated
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    psrc0 = src;
+    pdst0 = dst;
+
+    // fixed destinations for the column tails: width-4, width-2 and width-1
+    // sections start after all n rounded down to a multiple of 8/4/2 columns
+    pdst2 = dst + m * (n & ~7);
+    pdst3 = dst + m * (n & ~3);
+    pdst4 = dst + m * (n & ~1);
+
+    for (j = (m >> 3); j--;)  // full blocks of 8 lda-strided rows
+    {
+        psrc1 = psrc0;        // eight row pointers, each lda apart
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 64;          // next 8-row block starts 8x8 floats later in dst
+
+        for (i = (n >> 3); i--;)  // 8 columns per iteration
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);  // load 8 floats per row; macros advance the pointer
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
+
+            ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);  // rows stored back-to-back, no transpose
+            ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 32, 4);
+            pdst1 += m * 8;   // next 8-column group of this row block lies m*8 floats ahead
+        }
+
+        if (n & 4)  // width-4 column tail goes to the pdst2 section
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc3);
+            src3 = LD_SP(psrc4);
+            src4 = LD_SP(psrc5);
+            src5 = LD_SP(psrc6);
+            src6 = LD_SP(psrc7);
+            src7 = LD_SP(psrc8);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+            psrc5 += 4;
+            psrc6 += 4;
+            psrc7 += 4;
+            psrc8 += 4;
+
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
+        }
+
+        if (n & 2)  // width-2 column tail: two floats per row into the pdst3 section
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc3++;
+            *pdst3++ = *psrc3++;
+            *pdst3++ = *psrc4++;
+            *pdst3++ = *psrc4++;
+            *pdst3++ = *psrc5++;
+            *pdst3++ = *psrc5++;
+            *pdst3++ = *psrc6++;
+            *pdst3++ = *psrc6++;
+            *pdst3++ = *psrc7++;
+            *pdst3++ = *psrc7++;
+            *pdst3++ = *psrc8++;
+            *pdst3++ = *psrc8++;
+        }
+
+        if (n & 1)  // width-1 column tail: one float per row into the pdst4 section
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+            *pdst4++ = *psrc5++;
+            *pdst4++ = *psrc6++;
+            *pdst4++ = *psrc7++;
+            *pdst4++ = *psrc8++;
+        }
+    }
+
+    if (m & 4)  // remaining block of 4 rows, same layout with half-size strides
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 32;          // 4 rows x 8 columns per block
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc3);
+            src3 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ST_SP4_INC(src0, src1, src2, src3, pdst2, 4);
+        }
+
+        if (n & 2)
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc3++;
+            *pdst3++ = *psrc3++;
+            *pdst3++ = *psrc4++;
+            *pdst3++ = *psrc4++;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+            *pdst4++ = *psrc3++;
+            *pdst4++ = *psrc4++;
+        }
+    }
+
+    if (m & 2)  // remaining block of 2 rows
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 16;          // 2 rows x 8 columns per block
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ST_SP4(src0, src1, src2, src3, pdst1, 4);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ST_SP2_INC(src0, src1, pdst2, 4);
+        }
+
+        if (n & 2)
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc2++;
+            *pdst3++ = *psrc2++;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+            *pdst4++ = *psrc2++;
+        }
+    }
+
+    if (m & 1)  // final single row
+    {
+        psrc1 = psrc0;
+        psrc0 += lda;
+
+        pdst1 = pdst0;
+        pdst0 += 8;           // 1 row x 8 columns per block
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+
+            ST_SP2(src0, src1, pdst1, 4);
+            pdst1 += 8 * m;
+        }
+
+        if (n & 4)
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst2);
+            pdst2 += 4;
+        }
+
+        if (n & 2)
+        {
+            *pdst3++ = *psrc1++;
+            *pdst3++ = *psrc1++;
+        }
+
+        if (n & 1)
+        {
+            *pdst4++ = *psrc1++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c
new file mode 100644
index 000000000..ae6e6558f
--- /dev/null
+++ b/kernel/mips/sgemv_n_msa.c
@@ -0,0 +1,515 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define SGEMV_N_8x8() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ LD_SP2(pa2 + k, 4, t4, t5); \
+ LD_SP2(pa3 + k, 4, t6, t7); \
+ LD_SP2(pa4 + k, 4, t8, t9); \
+ LD_SP2(pa5 + k, 4, t10, t11); \
+ LD_SP2(pa6 + k, 4, t12, t13); \
+ LD_SP2(pa7 + k, 4, t14, t15); \
+ \
+ y0 += tp0 * t0; \
+ y1 += tp0 * t1; \
+ \
+ y0 += tp1 * t2; \
+ y1 += tp1 * t3; \
+ \
+ y0 += tp2 * t4; \
+ y1 += tp2 * t5; \
+ \
+ y0 += tp3 * t6; \
+ y1 += tp3 * t7; \
+ \
+ y0 += tp4 * t8; \
+ y1 += tp4 * t9; \
+ \
+ y0 += tp5 * t10; \
+ y1 += tp5 * t11; \
+ \
+ y0 += tp6 * t12; \
+ y1 += tp6 * t13; \
+ \
+ y0 += tp7 * t14; \
+ y1 += tp7 * t15; \
+}
+
+#define SGEMV_N_4x8() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ t4 = LD_SP(pa2 + k); \
+ t6 = LD_SP(pa3 + k); \
+ t8 = LD_SP(pa4 + k); \
+ t10 = LD_SP(pa5 + k); \
+ t12 = LD_SP(pa6 + k); \
+ t14 = LD_SP(pa7 + k); \
+ \
+ y0 += tp0 * t0; \
+ y0 += tp1 * t2; \
+ y0 += tp2 * t4; \
+ y0 += tp3 * t6; \
+ y0 += tp4 * t8; \
+ y0 += tp5 * t10; \
+ y0 += tp6 * t12; \
+ y0 += tp7 * t14; \
+}
+
+#define SGEMV_N_8x4() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ LD_SP2(pa2 + k, 4, t4, t5); \
+ LD_SP2(pa3 + k, 4, t6, t7); \
+ \
+ y0 += tp0 * t0; \
+ y1 += tp0 * t1; \
+ \
+ y0 += tp1 * t2; \
+ y1 += tp1 * t3; \
+ \
+ y0 += tp2 * t4; \
+ y1 += tp2 * t5; \
+ \
+ y0 += tp3 * t6; \
+ y1 += tp3 * t7; \
+}
+
+#define SGEMV_N_4x4() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ t4 = LD_SP(pa2 + k); \
+ t6 = LD_SP(pa3 + k); \
+ \
+ y0 += tp0 * t0; \
+ y0 += tp1 * t2; \
+ y0 += tp2 * t4; \
+ y0 += tp3 * t6; \
+}
+
+#define SGEMV_N_8x2() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ \
+ y0 += tp0 * t0; \
+ y1 += tp0 * t1; \
+ \
+ y0 += tp1 * t2; \
+ y1 += tp1 * t3; \
+}
+
+#define SGEMV_N_4x2() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ \
+ y0 += tp0 * t0; \
+ y0 += tp1 * t2; \
+}
+
+#define SLOAD_X8_SCALE_GP() \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ temp2 = alpha * x[2 * inc_x]; \
+ temp3 = alpha * x[3 * inc_x]; \
+ temp4 = alpha * x[4 * inc_x]; \
+ temp5 = alpha * x[5 * inc_x]; \
+ temp6 = alpha * x[6 * inc_x]; \
+ temp7 = alpha * x[7 * inc_x]; \
+ \
+ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
+ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
+ tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
+ tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
+ tp4 = COPY_FLOAT_TO_VECTOR(temp4); \
+ tp5 = COPY_FLOAT_TO_VECTOR(temp5); \
+ tp6 = COPY_FLOAT_TO_VECTOR(temp6); \
+ tp7 = COPY_FLOAT_TO_VECTOR(temp7); \
+
+#define SLOAD_X4_SCALE_GP() \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ temp2 = alpha * x[2 * inc_x]; \
+ temp3 = alpha * x[3 * inc_x]; \
+ \
+ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
+ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
+ tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
+ tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
+
+#define SLOAD_X8_SCALE_VECTOR() \
+ LD_SP2(x, 4, x0, x1); \
+ \
+ x0 = x0 * v_alpha; \
+ x1 = x1 * v_alpha; \
+ \
+ SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
+ SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \
+
+#define SLOAD_X4_SCALE_VECTOR() \
+ x0 = LD_SP(x); \
+ x0 = x0 * v_alpha; \
+ SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
+
+#define SLOAD_Y8_GP() \
+ y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
+ y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \
+ y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \
+ y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, *((int *)(y + 6 * inc_y))); \
+ y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \
+
+#define SLOAD_Y4_GP() \
+ y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
+ y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
+
+#define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1);  /* unit-stride y: vector-load y[0..7] into y0,y1 */
+#define SLOAD_Y4_VECTOR() y0 = LD_SP(y);  /* unit-stride y: vector-load y[0..3] into y0 */
+
+#define SSTORE_Y8_GP() \
+ *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
+ *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
+ *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
+ *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
+ *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \
+ *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \
+ *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \
+ *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \
+
+#define SSTORE_Y4_GP() \
+ *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
+ *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
+ *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
+ *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
+
+#define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4);  /* unit-stride y: vector-store y0,y1 back to y[0..7] */
+#define SSTORE_Y4_VECTOR() ST_SP(y0, y);  /* unit-stride y: vector-store y0 back to y[0..3] */
+
+#define SGEMV_N_MSA() \
+ for (j = (n >> 3); j--;) \
+ { \
+ SLOAD_X8_SCALE(); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_Y8(); \
+ SGEMV_N_8x8(); \
+ SSTORE_Y8(); \
+ \
+ y += 8 * inc_y; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_Y4(); \
+ SGEMV_N_4x8(); \
+ SSTORE_Y4(); \
+ \
+ y += 4 * inc_y; \
+ k += 4; \
+ } \
+ \
+ if (m & 3) \
+ { \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ temp2 = alpha * x[2 * inc_x]; \
+ temp3 = alpha * x[3 * inc_x]; \
+ temp4 = alpha * x[4 * inc_x]; \
+ temp5 = alpha * x[5 * inc_x]; \
+ temp6 = alpha * x[6 * inc_x]; \
+ temp7 = alpha * x[7 * inc_x]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp = y[0]; \
+ temp += temp0 * pa0[k]; \
+ temp += temp1 * pa1[k]; \
+ temp += temp2 * pa2[k]; \
+ temp += temp3 * pa3[k]; \
+ temp += temp4 * pa4[k]; \
+ temp += temp5 * pa5[k]; \
+ temp += temp6 * pa6[k]; \
+ temp += temp7 * pa7[k]; \
+ y[0] = temp; \
+ \
+ y += inc_y; \
+ k++; \
+ } \
+ } \
+ pa0 += 8 * lda; \
+ pa1 += 8 * lda; \
+ pa2 += 8 * lda; \
+ pa3 += 8 * lda; \
+ pa4 += 8 * lda; \
+ pa5 += 8 * lda; \
+ pa6 += 8 * lda; \
+ pa7 += 8 * lda; \
+ \
+ x += 8 * inc_x; \
+ } \
+ \
+ if (n & 4) \
+ { \
+ SLOAD_X4_SCALE(); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_Y8(); \
+ SGEMV_N_8x4(); \
+ SSTORE_Y8(); \
+ \
+ y += 8 * inc_y; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_Y4(); \
+ SGEMV_N_4x4(); \
+ SSTORE_Y4(); \
+ \
+ y += 4 * inc_y; \
+ k += 4; \
+ } \
+ \
+ if (m & 3) \
+ { \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ temp2 = alpha * x[2 * inc_x]; \
+ temp3 = alpha * x[3 * inc_x]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp = y[0]; \
+ temp += temp0 * pa0[k]; \
+ temp += temp1 * pa1[k]; \
+ temp += temp2 * pa2[k]; \
+ temp += temp3 * pa3[k]; \
+ y[0] = temp; \
+ \
+ y += inc_y; \
+ k++; \
+ } \
+ } \
+ \
+ pa0 += 4 * lda; \
+ pa1 += 4 * lda; \
+ pa2 += 4 * lda; \
+ pa3 += 4 * lda; \
+ \
+ x += 4 * inc_x; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ \
+ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
+ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_Y8(); \
+ SGEMV_N_8x2(); \
+ SSTORE_Y8(); \
+ \
+ y += 8 * inc_y; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_Y4(); \
+ SGEMV_N_4x2(); \
+ SSTORE_Y4(); \
+ \
+ y += 4 * inc_y; \
+ k += 4; \
+ } \
+ \
+ if (m & 3) \
+ { \
+ temp0 = alpha * x[0 * inc_x]; \
+ temp1 = alpha * x[1 * inc_x]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp = y[0]; \
+ temp += temp0 * pa0[k]; \
+ temp += temp1 * pa1[k]; \
+ y[0] = temp; \
+ \
+ y += inc_y; \
+ k++; \
+ } \
+ } \
+ \
+ pa0 += 2 * lda; \
+ pa1 += 2 * lda; \
+ \
+ x += 2 * inc_x; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ temp = alpha * x[0]; \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = m; i--;) \
+ { \
+ y[0] += temp * pa0[k]; \
+ \
+ y += inc_y; \
+ k++; \
+ } \
+ } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{   /* sgemv, no-transpose: y := y + alpha * A * x; column-major A, column stride lda */
+    BLASLONG i, j, k;
+    FLOAT *y_org = y;   /* SGEMV_N_MSA() rewinds y here for every block of 8/4/2/1 columns */
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;   /* 8 consecutive columns of A */
+    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v4f32 v_alpha, x0, x1, y0, y1;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;   /* alpha*x[j] broadcast across all 4 lanes */
+
+    v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
+
+    pa0 = A;
+    pa1 = A + lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {   /* both x and y contiguous: vector loads/stores for both */
+        #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
+        #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
+        #define SLOAD_Y8 SLOAD_Y8_VECTOR
+        #define SLOAD_Y4 SLOAD_Y4_VECTOR
+        #define SSTORE_Y8 SSTORE_Y8_VECTOR
+        #define SSTORE_Y4 SSTORE_Y4_VECTOR
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else if (1 == inc_y)
+    {   /* strided x, contiguous y: gather x element-by-element, vector y */
+        #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
+        #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
+        #define SLOAD_Y8 SLOAD_Y8_VECTOR
+        #define SLOAD_Y4 SLOAD_Y4_VECTOR
+        #define SSTORE_Y8 SSTORE_Y8_VECTOR
+        #define SSTORE_Y4 SSTORE_Y4_VECTOR
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else if (1 == inc_x)
+    {   /* contiguous x, strided y: vector x, insert/extract y lane-by-lane */
+        #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
+        #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
+        #define SLOAD_Y8 SLOAD_Y8_GP
+        #define SLOAD_Y4 SLOAD_Y4_GP
+        #define SSTORE_Y8 SSTORE_Y8_GP
+        #define SSTORE_Y4 SSTORE_Y4_GP
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else
+    {   /* both strided: element-wise access for x and y */
+        #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
+        #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
+        #define SLOAD_Y8 SLOAD_Y8_GP
+        #define SLOAD_Y4 SLOAD_Y4_GP
+        #define SSTORE_Y8 SSTORE_Y8_GP
+        #define SSTORE_Y4 SSTORE_Y4_GP
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/sgemv_t_msa.c b/kernel/mips/sgemv_t_msa.c
new file mode 100644
index 000000000..1c7f2998f
--- /dev/null
+++ b/kernel/mips/sgemv_t_msa.c
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define SGEMV_T_8x8() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ LD_SP2(pa2 + k, 4, t4, t5); \
+ LD_SP2(pa3 + k, 4, t6, t7); \
+ LD_SP2(pa4 + k, 4, t8, t9); \
+ LD_SP2(pa5 + k, 4, t10, t11); \
+ LD_SP2(pa6 + k, 4, t12, t13); \
+ LD_SP2(pa7 + k, 4, t14, t15); \
+ \
+ tp0 += x0 * t0; \
+ tp0 += x1 * t1; \
+ \
+ tp1 += x0 * t2; \
+ tp1 += x1 * t3; \
+ \
+ tp2 += x0 * t4; \
+ tp2 += x1 * t5; \
+ \
+ tp3 += x0 * t6; \
+ tp3 += x1 * t7; \
+ \
+ tp4 += x0 * t8; \
+ tp4 += x1 * t9; \
+ \
+ tp5 += x0 * t10; \
+ tp5 += x1 * t11; \
+ \
+ tp6 += x0 * t12; \
+ tp6 += x1 * t13; \
+ \
+ tp7 += x0 * t14; \
+ tp7 += x1 * t15; \
+}
+
+#define SGEMV_T_8x4() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ t4 = LD_SP(pa2 + k); \
+ t6 = LD_SP(pa3 + k); \
+ t8 = LD_SP(pa4 + k); \
+ t10 = LD_SP(pa5 + k); \
+ t12 = LD_SP(pa6 + k); \
+ t14 = LD_SP(pa7 + k); \
+ \
+ tp0 += x0 * t0; \
+ tp1 += x0 * t2; \
+ tp2 += x0 * t4; \
+ tp3 += x0 * t6; \
+ tp4 += x0 * t8; \
+ tp5 += x0 * t10; \
+ tp6 += x0 * t12; \
+ tp7 += x0 * t14; \
+}
+
+#define SGEMV_T_4x8() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ LD_SP2(pa2 + k, 4, t4, t5); \
+ LD_SP2(pa3 + k, 4, t6, t7); \
+ \
+ tp0 += x0 * t0; \
+ tp0 += x1 * t1; \
+ \
+ tp1 += x0 * t2; \
+ tp1 += x1 * t3; \
+ \
+ tp2 += x0 * t4; \
+ tp2 += x1 * t5; \
+ \
+ tp3 += x0 * t6; \
+ tp3 += x1 * t7; \
+}
+
+#define SGEMV_T_4x4() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ t4 = LD_SP(pa2 + k); \
+ t6 = LD_SP(pa3 + k); \
+ \
+ tp0 += x0 * t0; \
+ tp1 += x0 * t2; \
+ tp2 += x0 * t4; \
+ tp3 += x0 * t6; \
+}
+
+#define SGEMV_T_2x8() \
+{ \
+ LD_SP2(pa0 + k, 4, t0, t1); \
+ LD_SP2(pa1 + k, 4, t2, t3); \
+ \
+ tp0 += x0 * t0; \
+ tp0 += x1 * t1; \
+ \
+ tp1 += x0 * t2; \
+ tp1 += x1 * t3; \
+}
+
+#define SGEMV_T_2x4() \
+{ \
+ t0 = LD_SP(pa0 + k); \
+ t2 = LD_SP(pa1 + k); \
+ \
+ tp0 += x0 * t0; \
+ tp1 += x0 * t2; \
+}
+
+#define SLOAD_X8_GP() \
+ x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
+ x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \
+
+#define SLOAD_X4_GP() \
+ x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
+
+#define SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1);  /* unit-stride x: vector-load x[0..7] into x0,x1 */
+#define SLOAD_X4_VECTOR() x0 = LD_SP(x);  /* unit-stride x: vector-load x[0..3] into x0 */
+
+#define SGEMV_T_MSA() \
+ for (j = (n >> 3); j--;) \
+ { \
+ tp0 = zero; \
+ tp1 = zero; \
+ tp2 = zero; \
+ tp3 = zero; \
+ tp4 = zero; \
+ tp5 = zero; \
+ tp6 = zero; \
+ tp7 = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_X8(); \
+ SGEMV_T_8x8(); \
+ \
+ x += 8 * inc_x; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_X4(); \
+ SGEMV_T_8x4(); \
+ \
+ x += 4 * inc_x; \
+ k += 4; \
+ } \
+ \
+ TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
+ tp0, tp1, tp2, tp3); \
+ TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \
+ tp4, tp5, tp6, tp7); \
+ tp0 += tp1; \
+ tp0 += tp2; \
+ tp0 += tp3; \
+ tp4 += tp5; \
+ tp4 += tp6; \
+ tp4 += tp7; \
+ \
+ temp0 = tp0[0]; \
+ temp1 = tp0[1]; \
+ temp2 = tp0[2]; \
+ temp3 = tp0[3]; \
+ temp4 = tp4[0]; \
+ temp5 = tp4[1]; \
+ temp6 = tp4[2]; \
+ temp7 = tp4[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp0 += pa0[k] * x[0]; \
+ temp1 += pa1[k] * x[0]; \
+ temp2 += pa2[k] * x[0]; \
+ temp3 += pa3[k] * x[0]; \
+ temp4 += pa4[k] * x[0]; \
+ temp5 += pa5[k] * x[0]; \
+ temp6 += pa6[k] * x[0]; \
+ temp7 += pa7[k] * x[0]; \
+ \
+ x += inc_x; \
+ k++; \
+ } \
+ \
+ res0 = y[0 * inc_y]; \
+ res1 = y[1 * inc_y]; \
+ res2 = y[2 * inc_y]; \
+ res3 = y[3 * inc_y]; \
+ res4 = y[4 * inc_y]; \
+ res5 = y[5 * inc_y]; \
+ res6 = y[6 * inc_y]; \
+ res7 = y[7 * inc_y]; \
+ \
+ res0 += alpha * temp0; \
+ res1 += alpha * temp1; \
+ res2 += alpha * temp2; \
+ res3 += alpha * temp3; \
+ res4 += alpha * temp4; \
+ res5 += alpha * temp5; \
+ res6 += alpha * temp6; \
+ res7 += alpha * temp7; \
+ \
+ y[0 * inc_y] = res0; \
+ y[1 * inc_y] = res1; \
+ y[2 * inc_y] = res2; \
+ y[3 * inc_y] = res3; \
+ y[4 * inc_y] = res4; \
+ y[5 * inc_y] = res5; \
+ y[6 * inc_y] = res6; \
+ y[7 * inc_y] = res7; \
+ \
+ y += 8 * inc_y; \
+ \
+ pa0 += 8 * lda; \
+ pa1 += 8 * lda; \
+ pa2 += 8 * lda; \
+ pa3 += 8 * lda; \
+ pa4 += 8 * lda; \
+ pa5 += 8 * lda; \
+ pa6 += 8 * lda; \
+ pa7 += 8 * lda; \
+ } \
+ \
+ if (n & 4) \
+ { \
+ tp0 = zero; \
+ tp1 = zero; \
+ tp2 = zero; \
+ tp3 = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_X8(); \
+ SGEMV_T_4x8(); \
+ \
+ x += 8 * inc_x; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_X4(); \
+ SGEMV_T_4x4(); \
+ \
+ x += 4 * inc_x; \
+ k += 4; \
+ } \
+ \
+ TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
+ tp0, tp1, tp2, tp3); \
+ tp0 += tp1; \
+ tp0 += tp2; \
+ tp0 += tp3; \
+ \
+ temp0 = tp0[0]; \
+ temp1 = tp0[1]; \
+ temp2 = tp0[2]; \
+ temp3 = tp0[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp0 += pa0[k] * x[0]; \
+ temp1 += pa1[k] * x[0]; \
+ temp2 += pa2[k] * x[0]; \
+ temp3 += pa3[k] * x[0]; \
+ \
+ x += inc_x; \
+ k++; \
+ } \
+ \
+ res0 = y[0 * inc_y]; \
+ res1 = y[1 * inc_y]; \
+ res2 = y[2 * inc_y]; \
+ res3 = y[3 * inc_y]; \
+ \
+ res0 += alpha * temp0; \
+ res1 += alpha * temp1; \
+ res2 += alpha * temp2; \
+ res3 += alpha * temp3; \
+ \
+ y[0 * inc_y] = res0; \
+ y[1 * inc_y] = res1; \
+ y[2 * inc_y] = res2; \
+ y[3 * inc_y] = res3; \
+ \
+ y += 4 * inc_y; \
+ \
+ pa0 += 4 * lda; \
+ pa1 += 4 * lda; \
+ pa2 += 4 * lda; \
+ pa3 += 4 * lda; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ tp0 = zero; \
+ tp1 = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 3); i--;) \
+ { \
+ SLOAD_X8(); \
+ SGEMV_T_2x8(); \
+ \
+ x += 8 * inc_x; \
+ k += 8; \
+ } \
+ \
+ if (m & 4) \
+ { \
+ SLOAD_X4(); \
+ SGEMV_T_2x4(); \
+ \
+ x += 4 * inc_x; \
+ k += 4; \
+ } \
+ \
+ ILVRL_W2_SP(tp1, tp0, tp2, tp3); \
+ \
+ tp2 += tp3; \
+ \
+ temp0 = tp2[0] + tp2[2]; \
+ temp1 = tp2[1] + tp2[3]; \
+ \
+ for (i = (m & 3); i--;) \
+ { \
+ temp0 += pa0[k] * x[0]; \
+ temp1 += pa1[k] * x[0]; \
+ \
+ x += inc_x; \
+ k++; \
+ } \
+ \
+ res0 = y[0 * inc_y]; \
+ res1 = y[1 * inc_y]; \
+ \
+ res0 += alpha * temp0; \
+ res1 += alpha * temp1; \
+ \
+ y[0 * inc_y] = res0; \
+ y[1 * inc_y] = res1; \
+ \
+ y += 2 * inc_y; \
+ \
+ pa0 += 2 * lda; \
+ pa1 += 2 * lda; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ temp0 = 0.0; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = m; i--;) \
+ { \
+ temp0 += pa0[k] * x[0]; \
+ \
+ x += inc_x; \
+ k++; \
+ } \
+ \
+ y[0] += alpha * temp0; \
+ y += inc_y; \
+ pa0 += lda; \
+ }
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{   /* sgemv, transpose: y := y + alpha * A^T * x; column-major A, column stride lda */
+    BLASLONG i, j, k;
+    FLOAT *srcx_org = x;   /* SGEMV_T_MSA() rewinds x here for each block of columns */
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;   /* 8 consecutive columns of A */
+    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;   /* scalar tails of the dot products */
+    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+    v4f32 x0, x1;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;   /* per-column partial sums, 4 lanes each */
+    v4f32 zero = {0};
+
+    pa0 = A + 0 * lda;
+    pa1 = A + 1 * lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if (1 == inc_x)
+    {   /* contiguous x: vector loads */
+        #define SLOAD_X8 SLOAD_X8_VECTOR
+        #define SLOAD_X4 SLOAD_X4_VECTOR
+
+        SGEMV_T_MSA();
+
+        #undef SLOAD_X8
+        #undef SLOAD_X4
+    }
+    else
+    {   /* strided x: insert elements lane-by-lane */
+        #define SLOAD_X8 SLOAD_X8_GP
+        #define SLOAD_X4 SLOAD_X4_GP
+
+        SGEMV_T_MSA();
+
+        #undef SLOAD_X8
+        #undef SLOAD_X4
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c
new file mode 100644
index 000000000..53891e64f
--- /dev/null
+++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c
@@ -0,0 +1,1786 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
/* 8x8 tile solver for the single-precision STRSM 'LN' kernel (MIPS MSA).
 *
 * a   packed triangular 8x8 block of A, row-major with row stride 8; on
 *     entry it points one block PAST the data and is rewound below (a -= 64).
 *     Diagonal entries are applied by multiplication, so they are presumably
 *     stored pre-inverted by the packing routine -- TODO confirm against the
 *     generic trsm kernel.
 * b   packed 8x8 block of right-hand sides; also entered one block past and
 *     rewound (b -= 64).  Solved values are written back here for reuse.
 * c   8x8 tile of C, leading dimension ldc.
 * bk  number of rank-1 updates of already-solved data to subtract first.
 */
static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
    v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
    v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36;
    v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
    v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
    v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    /* Load the 8x8 C tile: two 4-float vectors from each of the 8 C lines. */
    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
    LD_SP2(c_nxt4line, 4, src_c8, src_c9);
    LD_SP2(c_nxt5line, 4, src_c10, src_c11);
    LD_SP2(c_nxt6line, 4, src_c12, src_c13);
    LD_SP2(c_nxt7line, 4, src_c14, src_c15);

    /* GEMM part: C_tile -= A(8 x bk) * B(bk x 8), one rank-1 step per k. */
    for (k = 0; k < bk; k++)
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        src_b = LD_SP(bb + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c8 -= src_a0 * src_b0;
        src_c9 -= src_a1 * src_b0;
        src_c10 -= src_a0 * src_b1;
        src_c11 -= src_a1 * src_b1;
        src_c12 -= src_a0 * src_b2;
        src_c13 -= src_a1 * src_b2;
        src_c14 -= src_a0 * src_b3;
        src_c15 -= src_a1 * src_b3;

        aa += 8;
        bb += 8;
    }

    /* Rewind to the start of the packed 8x8 blocks. */
    a -= 64;
    b -= 64;

    /* Transpose 4x4 sub-tiles so that each res_c* vector gathers one tile
       row across four C lines (rows 4..7 first, then rows 0..3). */
    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);
    TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
                       res_c12, res_c13, res_c14, res_c15);
    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
                       res_c8, res_c9, res_c10, res_c11);

    /* Row 7 of the triangular block: a[56..63], broadcast lane-wise. */
    src_a = LD_SP(a + 60);
    SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
    src_a = LD_SP(a + 56);
    SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);

    /* Solve row 7 (diagonal a63 applied by multiply), eliminate into rows 0..6. */
    res_c7 *= src_a63;
    res_c15 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c14 -= res_c15 * src_a62;
    res_c5 -= res_c7 * src_a61;
    res_c13 -= res_c15 * src_a61;
    res_c4 -= res_c7 * src_a60;
    res_c12 -= res_c15 * src_a60;
    res_c3 -= res_c7 * src_a59;
    res_c11 -= res_c15 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c10 -= res_c15 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c9 -= res_c15 * src_a57;
    res_c0 -= res_c7 * src_a56;
    res_c8 -= res_c15 * src_a56;

    /* Row 6: a[48..54]. */
    src_a = LD_SP(a + 48);
    SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
    src_a52 = LD_SP(a + 52);
    src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
    src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
    src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);

    res_c6 *= src_a54;
    res_c14 *= src_a54;
    res_c5 -= res_c6 * src_a53;
    res_c13 -= res_c14 * src_a53;
    res_c4 -= res_c6 * src_a52;
    res_c12 -= res_c14 * src_a52;
    res_c3 -= res_c6 * src_a51;
    res_c11 -= res_c14 * src_a51;
    res_c2 -= res_c6 * src_a50;
    res_c10 -= res_c14 * src_a50;
    res_c1 -= res_c6 * src_a49;
    res_c9 -= res_c14 * src_a49;
    res_c0 -= res_c6 * src_a48;
    res_c8 -= res_c14 * src_a48;

    /* Row 5: a[40..45]. */
    src_a = LD_SP(a + 40);
    SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
    src_a44 = LD_SP(a + 44);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
    src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);

    res_c5 *= src_a45;
    res_c13 *= src_a45;
    res_c4 -= res_c5 * src_a44;
    res_c12 -= res_c13 * src_a44;
    res_c3 -= res_c5 * src_a43;
    res_c11 -= res_c13 * src_a43;
    res_c2 -= res_c5 * src_a42;
    res_c10 -= res_c13 * src_a42;
    res_c1 -= res_c5 * src_a41;
    res_c9 -= res_c13 * src_a41;
    res_c0 -= res_c5 * src_a40;
    res_c8 -= res_c13 * src_a40;

    /* Row 4: a[32..36]. */
    src_a = LD_SP(a + 32);
    SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
    src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));

    res_c4 *= src_a36;
    res_c12 *= src_a36;
    res_c3 -= res_c4 * src_a35;
    res_c11 -= res_c12 * src_a35;
    res_c2 -= res_c4 * src_a34;
    res_c10 -= res_c12 * src_a34;
    res_c1 -= res_c4 * src_a33;
    res_c9 -= res_c12 * src_a33;
    res_c0 -= res_c4 * src_a32;
    res_c8 -= res_c12 * src_a32;

    /* Rows 4..7 are fully solved: store to the packed buffer and, after
       transposing back to column layout, to the C tile. */
    ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
    ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);

    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);
    TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
                       src_c9, src_c11, src_c13, src_c15);

    ST_SP(src_c1, c + 4);
    ST_SP(src_c3, c_nxt1line + 4);
    ST_SP(src_c5, c_nxt2line + 4);
    ST_SP(src_c7, c_nxt3line + 4);
    ST_SP(src_c9, c_nxt4line + 4);
    ST_SP(src_c11, c_nxt5line + 4);
    ST_SP(src_c13, c_nxt6line + 4);
    ST_SP(src_c15, c_nxt7line + 4);

    /* Row 3: a[24..27]. */
    src_a = LD_SP(a + 24);
    SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);

    res_c3 *= src_a27;
    res_c11 *= src_a27;
    res_c2 -= res_c3 * src_a26;
    res_c10 -= res_c11 * src_a26;
    res_c1 -= res_c3 * src_a25;
    res_c9 -= res_c11 * src_a25;
    res_c0 -= res_c3 * src_a24;
    res_c8 -= res_c11 * src_a24;

    /* Row 2: a[16..18]. */
    src_a16 = LD_SP(a + 16);
    src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
    src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
    src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);

    res_c2 *= src_a18;
    res_c10 *= src_a18;
    res_c1 -= res_c2 * src_a17;
    res_c9 -= res_c10 * src_a17;
    res_c0 -= res_c2 * src_a16;
    res_c8 -= res_c10 * src_a16;

    /* Rows 1 and 0: a[8..9] and a[0]. */
    src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
    src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));

    res_c1 *= src_a9;
    res_c9 *= src_a9;
    res_c0 -= res_c1 * src_a8;
    res_c8 -= res_c9 * src_a8;

    res_c0 *= src_a0;
    res_c8 *= src_a0;

    /* Store solved rows 0..3 to the packed buffer and back to C. */
    ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
    ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
                       src_c8, src_c10, src_c12, src_c14);

    ST_SP(src_c0, c);
    ST_SP(src_c2, c_nxt1line);
    ST_SP(src_c4, c_nxt2line);
    ST_SP(src_c6, c_nxt3line);
    ST_SP(src_c8, c_nxt4line);
    ST_SP(src_c10, c_nxt5line);
    ST_SP(src_c12, c_nxt6line);
    ST_SP(src_c14, c_nxt7line);
}
+
/* 8x4 tile solver for the single-precision STRSM 'LN' kernel (MIPS MSA).
 *
 * Same structure as ssolve_8x8_ln_msa but with only 4 C lines: the GEMM
 * part is unrolled by two with a scalar remainder iteration; 'a' and 'b'
 * enter one packed block past the data and are rewound (a -= 64, b -= 32).
 * Diagonal entries are applied by multiplication, so they are presumably
 * stored pre-inverted by the packing routine -- TODO confirm.
 */
static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24;
    v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35;
    v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45;
    v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54;
    v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    /* Load the 8x4 C tile: two vectors per C line. */
    LD_SP2(c, 4, src_c0, src_c1);
    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
    LD_SP2(c_nxt3line, 4, src_c6, src_c7);

    /* GEMM part, unrolled twice per iteration. */
    for (k = 0; k < (bk >> 1); k++)
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        aa += 8;
        bb += 4;

        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;

        aa += 8;
        bb += 4;
    }

    /* Remainder iteration when bk is odd. */
    if ((bk & 1) && (bk > 0))
    {
        LD_SP2(aa, 4, src_a0, src_a1);

        src_b = LD_SP(bb + 0);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a0 * src_b1;
        src_c3 -= src_a1 * src_b1;
        src_c4 -= src_a0 * src_b2;
        src_c5 -= src_a1 * src_b2;
        src_c6 -= src_a0 * src_b3;
        src_c7 -= src_a1 * src_b3;
    }

    /* Rewind to the start of the packed 8x8 A block and 8x4 B block. */
    a -= 64;
    b -= 32;

    /* Transpose so each res_c* holds one tile row across the 4 C lines. */
    TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    /* Rows 7 and 6 of the triangular block: a[56..63], a[48..54]. */
    src_a = LD_SP(a + 60);
    SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63);
    src_a = LD_SP(a + 56);
    SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59);

    src_a = LD_SP(a + 48);
    SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51);
    src_a52 = LD_SP(a + 52);
    src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2);
    src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1);
    src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0);

    /* Back-substitution from row 7 down to row 0. */
    res_c7 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c5 -= res_c7 * src_a61;
    res_c4 -= res_c7 * src_a60;
    res_c3 -= res_c7 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c0 -= res_c7 * src_a56;

    res_c6 *= src_a54;
    res_c5 -= res_c6 * src_a53;
    res_c4 -= res_c6 * src_a52;
    res_c3 -= res_c6 * src_a51;
    res_c2 -= res_c6 * src_a50;
    res_c1 -= res_c6 * src_a49;
    res_c0 -= res_c6 * src_a48;

    /* Row 5: a[40..45]. */
    src_a = LD_SP(a + 40);
    SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43);
    src_a44 = LD_SP(a + 44);
    src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1);
    src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0);

    res_c5 *= src_a45;
    res_c4 -= res_c5 * src_a44;
    res_c3 -= res_c5 * src_a43;
    res_c2 -= res_c5 * src_a42;
    res_c1 -= res_c5 * src_a41;
    res_c0 -= res_c5 * src_a40;

    /* Row 4: a[32..36]. */
    src_a = LD_SP(a + 32);
    SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
    src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));

    res_c4 *= src_a36;
    res_c3 -= res_c4 * src_a35;
    res_c2 -= res_c4 * src_a34;
    res_c1 -= res_c4 * src_a33;
    res_c0 -= res_c4 * src_a32;

    /* Row 3: a[24..27]. */
    src_a = LD_SP(a + 24);
    SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27);

    res_c3 *= src_a27;
    res_c2 -= res_c3 * src_a26;
    res_c1 -= res_c3 * src_a25;
    res_c0 -= res_c3 * src_a24;

    /* Row 2: a[16..18]. */
    src_a16 = LD_SP(a + 16);
    src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2);
    src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1);
    src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0);

    res_c2 *= src_a18;
    res_c1 -= res_c2 * src_a17;
    res_c0 -= res_c2 * src_a16;

    /* Rows 1 and 0: a[8..9], a[0]. */
    src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
    src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));

    res_c1 *= src_a9;
    res_c0 -= res_c1 * src_a8;

    res_c0 *= src_a0;

    /* Store solved rows to the packed buffer, then transpose back and
       store the tile to C. */
    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
    ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c2, src_c4, src_c6);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c1, src_c3, src_c5, src_c7);

    ST_SP2(src_c0, src_c1, c, 4);
    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
}
+
+static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
+ FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
+ FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
+ FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+ FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+ c2_nxt = *(c + 2 + ldc);
+ c3_nxt = *(c + 3 + ldc);
+ c4_nxt = *(c + 4 + ldc);
+ c5_nxt = *(c + 5 + ldc);
+ c6_nxt = *(c + 6 + ldc);
+ c7_nxt = *(c + 7 + ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+ c4 -= aa[4] * bb[0];
+ c5 -= aa[5] * bb[0];
+ c6 -= aa[6] * bb[0];
+ c7 -= aa[7] * bb[0];
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
+ c2_nxt -= aa[2] * bb[1];
+ c3_nxt -= aa[3] * bb[1];
+ c4_nxt -= aa[4] * bb[1];
+ c5_nxt -= aa[5] * bb[1];
+ c6_nxt -= aa[6] * bb[1];
+ c7_nxt -= aa[7] * bb[1];
+
+ aa += 8;
+ bb += 2;
+ }
+
+ a -= 64;
+ b -= 16;
+
+ a0 = *(a + 0);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a16 = *(a + 16);
+ a17 = *(a + 17);
+ a18 = *(a + 18);
+ a24 = *(a + 24);
+ a25 = *(a + 25);
+ a26 = *(a + 26);
+ a27 = *(a + 27);
+ a32 = *(a + 32);
+ a33 = *(a + 33);
+ a34 = *(a + 34);
+ a35 = *(a + 35);
+ a36 = *(a + 36);
+ a40 = *(a + 40);
+ a41 = *(a + 41);
+ a42 = *(a + 42);
+ a43 = *(a + 43);
+ a44 = *(a + 44);
+ a45 = *(a + 45);
+ a48 = *(a + 48);
+ a49 = *(a + 49);
+ a50 = *(a + 50);
+ a51 = *(a + 51);
+ a52 = *(a + 52);
+ a53 = *(a + 53);
+ a54 = *(a + 54);
+ a56 = *(a + 56);
+ a57 = *(a + 57);
+ a58 = *(a + 58);
+ a59 = *(a + 59);
+ a60 = *(a + 60);
+ a61 = *(a + 61);
+ a62 = *(a + 62);
+ a63 = *(a + 63);
+
+ c7 *= a63;
+ c7_nxt *= a63;
+ c6 -= c7 * a62;
+ c6_nxt -= c7_nxt * a62;
+ c5 -= c7 * a61;
+ c5_nxt -= c7_nxt * a61;
+ c4 -= c7 * a60;
+ c4_nxt -= c7_nxt * a60;
+ c3 -= c7 * a59;
+ c3_nxt -= c7_nxt * a59;
+ c2 -= c7 * a58;
+ c2_nxt -= c7_nxt * a58;
+ c1 -= c7 * a57;
+ c1_nxt -= c7_nxt * a57;
+ c0 -= c7 * a56;
+ c0_nxt -= c7_nxt * a56;
+
+ c6 *= a54;
+ c6_nxt *= a54;
+ c5 -= c6 * a53;
+ c5_nxt -= c6_nxt * a53;
+ c4 -= c6 * a52;
+ c4_nxt -= c6_nxt * a52;
+ c3 -= c6 * a51;
+ c3_nxt -= c6_nxt * a51;
+ c2 -= c6 * a50;
+ c2_nxt -= c6_nxt * a50;
+ c1 -= c6 * a49;
+ c1_nxt -= c6_nxt * a49;
+ c0 -= c6 * a48;
+ c0_nxt -= c6_nxt * a48;
+
+ c5 *= a45;
+ c5_nxt *= a45;
+ c4 -= c5 * a44;
+ c4_nxt -= c5_nxt * a44;
+ c3 -= c5 * a43;
+ c3_nxt -= c5_nxt * a43;
+ c2 -= c5 * a42;
+ c2_nxt -= c5_nxt * a42;
+ c1 -= c5 * a41;
+ c1_nxt -= c5_nxt * a41;
+ c0 -= c5 * a40;
+ c0_nxt -= c5_nxt * a40;
+
+ c4 *= a36;
+ c4_nxt *= a36;
+ c3 -= c4 * a35;
+ c3_nxt -= c4_nxt * a35;
+ c2 -= c4 * a34;
+ c2_nxt -= c4_nxt * a34;
+ c1 -= c4 * a33;
+ c1_nxt -= c4_nxt * a33;
+ c0 -= c4 * a32;
+ c0_nxt -= c4_nxt * a32;
+
+ c3 *= a27;
+ c3_nxt *= a27;
+ c2 -= c3 * a26;
+ c2_nxt -= c3_nxt * a26;
+ c1 -= c3 * a25;
+ c1_nxt -= c3_nxt * a25;
+ c0 -= c3 * a24;
+ c0_nxt -= c3_nxt * a24;
+
+ c2 *= a18;
+ c2_nxt *= a18;
+ c1 -= c2 * a17;
+ c1_nxt -= c2_nxt * a17;
+ c0 -= c2 * a16;
+ c0_nxt -= c2_nxt * a16;
+
+ c1 *= a9;
+ c1_nxt *= a9;
+ c0 -= c1 * a8;
+ c0_nxt -= c1_nxt * a8;
+
+ c0 *= a0;
+ c0_nxt *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+ *(b + 4) = c2;
+ *(b + 5) = c2_nxt;
+ *(b + 6) = c3;
+ *(b + 7) = c3_nxt;
+ *(b + 8) = c4;
+ *(b + 9) = c4_nxt;
+ *(b + 10) = c5;
+ *(b + 11) = c5_nxt;
+ *(b + 12) = c6;
+ *(b + 13) = c6_nxt;
+ *(b + 14) = c7;
+ *(b + 15) = c7_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 4) = c4;
+ *(c + 5) = c5;
+ *(c + 6) = c6;
+ *(c + 7) = c7;
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+ *(c + 2 + ldc) = c2_nxt;
+ *(c + 3 + ldc) = c3_nxt;
+ *(c + 4 + ldc) = c4_nxt;
+ *(c + 5 + ldc) = c5_nxt;
+ *(c + 6 + ldc) = c6_nxt;
+ *(c + 7 + ldc) = c7_nxt;
+}
+
+static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
+ FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
+ FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
+ FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+ c4 -= aa[4] * bb[0];
+ c5 -= aa[5] * bb[0];
+ c6 -= aa[6] * bb[0];
+ c7 -= aa[7] * bb[0];
+
+ aa += 8;
+ bb += 1;
+ }
+
+ a -= 64;
+ b -= 8;
+
+ a0 = *(a + 0);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a16 = *(a + 16);
+ a17 = *(a + 17);
+ a18 = *(a + 18);
+ a24 = *(a + 24);
+ a25 = *(a + 25);
+ a26 = *(a + 26);
+ a27 = *(a + 27);
+ a32 = *(a + 32);
+ a33 = *(a + 33);
+ a34 = *(a + 34);
+ a35 = *(a + 35);
+ a36 = *(a + 36);
+ a40 = *(a + 40);
+ a41 = *(a + 41);
+ a42 = *(a + 42);
+ a43 = *(a + 43);
+ a44 = *(a + 44);
+ a45 = *(a + 45);
+ a48 = *(a + 48);
+ a49 = *(a + 49);
+ a50 = *(a + 50);
+ a51 = *(a + 51);
+ a52 = *(a + 52);
+ a53 = *(a + 53);
+ a54 = *(a + 54);
+ a56 = *(a + 56);
+ a57 = *(a + 57);
+ a58 = *(a + 58);
+ a59 = *(a + 59);
+ a60 = *(a + 60);
+ a61 = *(a + 61);
+ a62 = *(a + 62);
+ a63 = *(a + 63);
+
+ c7 *= a63;
+
+ c6 -= c7 * a62;
+ c6 *= a54;
+
+ c5 -= c7 * a61;
+ c5 -= c6 * a53;
+ c5 *= a45;
+
+ c4 -= c7 * a60;
+ c4 -= c6 * a52;
+ c4 -= c5 * a44;
+ c4 *= a36;
+
+ c3 -= c7 * a59;
+ c3 -= c6 * a51;
+ c3 -= c5 * a43;
+ c3 -= c4 * a35;
+ c3 *= a27;
+
+ c2 -= c7 * a58;
+ c2 -= c6 * a50;
+ c2 -= c5 * a42;
+ c2 -= c4 * a34;
+ c2 -= c3 * a26;
+ c2 *= a18;
+
+ c1 -= c7 * a57;
+ c1 -= c6 * a49;
+ c1 -= c5 * a41;
+ c1 -= c4 * a33;
+ c1 -= c3 * a25;
+ c1 -= c2 * a17;
+ c1 *= a9;
+
+ c0 -= c7 * a56;
+ c0 -= c6 * a48;
+ c0 -= c5 * a40;
+ c0 -= c4 * a32;
+ c0 -= c3 * a24;
+ c0 -= c2 * a16;
+ c0 -= c1 * a8;
+ c0 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+ *(b + 4) = c4;
+ *(b + 5) = c5;
+ *(b + 6) = c6;
+ *(b + 7) = c7;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 4) = c4;
+ *(c + 5) = c5;
+ *(c + 6) = c6;
+ *(c + 7) = c7;
+}
+
/* 4x8 tile solver for the single-precision STRSM 'LN' kernel (MIPS MSA).
 *
 * a enters one packed 4x4 triangular block past the data (rewound by 16),
 * b one packed 4x8 block past (rewound by 32).  The triangular block is
 * row-major with row stride 4; diagonal entries are applied by multiply,
 * presumably pre-inverted by the packing routine -- TODO confirm.
 */
static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
    v4f32 src_a13, src_a14, src_a15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;
    FLOAT *c_nxt4line = c + 4 * ldc;
    FLOAT *c_nxt5line = c + 5 * ldc;
    FLOAT *c_nxt6line = c + 6 * ldc;
    FLOAT *c_nxt7line = c + 7 * ldc;

    /* Load the 4x8 C tile: one 4-float vector per C line. */
    src_c0 = LD_SP(c);
    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);
    src_c4 = LD_SP(c_nxt4line);
    src_c5 = LD_SP(c_nxt5line);
    src_c6 = LD_SP(c_nxt6line);
    src_c7 = LD_SP(c_nxt7line);

    /* GEMM part: C_tile -= A(4 x bk) * B(bk x 8). */
    for (k = 0; k < bk; k++)
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        src_b = LD_SP(bb + 4);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c4 -= src_a0 * src_b0;
        src_c5 -= src_a0 * src_b1;
        src_c6 -= src_a0 * src_b2;
        src_c7 -= src_a0 * src_b3;

        aa += 4;
        bb += 8;
    }

    /* Rewind to the start of the packed blocks. */
    a -= 16;
    b -= 32;

    /* Transpose so each res_c* holds one tile row across four C lines. */
    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);
    TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
                       res_c4, res_c5, res_c6, res_c7);

    /* Broadcast rows of the triangular block: row 3 = a[12..15],
       row 2 = a[8..10], row 1 = a[4..5], row 0 = a[0]. */
    src_a = LD_SP(a + 12);
    SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
    src_a8 = LD_SP(a + 8);
    src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
    src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
    src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);

    src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
    src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));

    /* Back-substitution from row 3 down to row 0. */
    res_c3 *= src_a15;
    res_c7 *= src_a15;
    res_c2 -= res_c3 * src_a14;
    res_c6 -= res_c7 * src_a14;
    res_c1 -= res_c3 * src_a13;
    res_c5 -= res_c7 * src_a13;
    res_c0 -= res_c3 * src_a12;
    res_c4 -= res_c7 * src_a12;

    res_c2 *= src_a10;
    res_c6 *= src_a10;
    res_c1 -= res_c2 * src_a9;
    res_c5 -= res_c6 * src_a9;
    res_c0 -= res_c2 * src_a8;
    res_c4 -= res_c6 * src_a8;

    res_c1 *= src_a5;
    res_c5 *= src_a5;
    res_c0 -= res_c1 * src_a4;
    res_c4 -= res_c5 * src_a4;

    res_c0 *= src_a0;
    res_c4 *= src_a0;

    /* Store solved rows to the packed buffer, then transpose back and
       store the tile to C. */
    ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
    ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);
    TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
                       src_c4, src_c5, src_c6, src_c7);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
    ST_SP(src_c4, c_nxt4line);
    ST_SP(src_c5, c_nxt5line);
    ST_SP(src_c6, c_nxt6line);
    ST_SP(src_c7, c_nxt7line);
}
+
/* 4x4 tile solver for the single-precision STRSM 'LN' kernel (MIPS MSA).
 *
 * Same structure as ssolve_4x8_ln_msa with four C lines; the GEMM part is
 * unrolled by two with a scalar remainder.  a and b enter one packed block
 * past the data and are rewound (a -= 16, b -= 16).  Diagonal entries are
 * applied by multiply (presumably pre-inverted -- TODO confirm).
 */
static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG k;
    FLOAT *aa = a, *bb = b;
    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
    v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
    v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12;
    v4f32 src_a13, src_a14, src_a15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    /* Load the 4x4 C tile. */
    src_c0 = LD_SP(c);
    src_c1 = LD_SP(c_nxt1line);
    src_c2 = LD_SP(c_nxt2line);
    src_c3 = LD_SP(c_nxt3line);

    /* GEMM part, unrolled twice per iteration. */
    for (k = 0; k < (bk >> 1); k++)
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        aa += 4;
        bb += 4;

        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;

        aa += 4;
        bb += 4;
    }

    /* Remainder iteration when bk is odd. */
    if ((bk & 1) && (bk > 0))
    {
        src_a0 = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a0 * src_b1;
        src_c2 -= src_a0 * src_b2;
        src_c3 -= src_a0 * src_b3;
    }

    /* Rewind to the start of the packed blocks. */
    a -= 16;
    b -= 16;

    /* Transpose so each res_c* holds one tile row across the 4 C lines. */
    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
                       res_c0, res_c1, res_c2, res_c3);

    /* Broadcast rows of the triangular block: row 3 = a[12..15],
       row 2 = a[8..10], row 1 = a[4..5], row 0 = a[0]. */
    src_a = LD_SP(a + 12);
    SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15);
    src_a8 = LD_SP(a + 8);
    src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
    src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
    src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
    src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
    src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));

    /* Back-substitution from row 3 down to row 0. */
    res_c3 *= src_a15;
    res_c2 -= res_c3 * src_a14;
    res_c1 -= res_c3 * src_a13;
    res_c0 -= res_c3 * src_a12;

    res_c2 *= src_a10;
    res_c1 -= res_c2 * src_a9;
    res_c0 -= res_c2 * src_a8;

    res_c1 *= src_a5;
    res_c0 -= res_c1 * src_a4;

    res_c0 *= src_a0;

    /* Store solved rows to the packed buffer, transpose back, store to C. */
    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);

    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
                       src_c0, src_c1, src_c2, src_c3);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
}
+
+static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
+ FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+ c2_nxt = *(c + 2 + ldc);
+ c3_nxt = *(c + 3 + ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
+ c2_nxt -= aa[2] * bb[1];
+ c3_nxt -= aa[3] * bb[1];
+
+ aa += 4;
+ bb += 2;
+ }
+
+ a -= 16;
+ b -= 8;
+
+ a0 = *(a + 0);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+
+ c3 *= a15;
+ c3_nxt *= a15;
+
+ c2 -= c3 * a14;
+ c2_nxt -= c3_nxt * a14;
+
+ c2 *= a10;
+ c2_nxt *= a10;
+
+ c1 -= c3 * a13;
+ c1_nxt -= c3_nxt * a13;
+
+ c1 -= c2 * a9;
+ c1_nxt -= c2_nxt * a9;
+
+ c1 *= a5;
+ c1_nxt *= a5;
+
+ c0 -= c3 * a12;
+ c0_nxt -= c3_nxt * a12;
+
+ c0 -= c2 * a8;
+ c0_nxt -= c2_nxt * a8;
+
+ c0 -= c1 * a4;
+ c0_nxt -= c1_nxt * a4;
+
+ c0 *= a0;
+ c0_nxt *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+ *(b + 4) = c2;
+ *(b + 5) = c2_nxt;
+ *(b + 6) = c3;
+ *(b + 7) = c3_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+ *(c + 2 + ldc) = c2_nxt;
+ *(c + 3 + ldc) = c3_nxt;
+}
+
+static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+
+ aa += 4;
+ bb += 1;
+ }
+
+ a -= 16;
+ b -= 4;
+
+ a0 = *(a + 0);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a8 = *(a + 8);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+
+ c3 *= a15;
+
+ c2 -= c3 * a14;
+ c2 *= a10;
+
+ c1 -= c3 * a13;
+ c1 -= c2 * a9;
+ c1 *= a5;
+
+ c0 -= c3 * a12;
+ c0 -= c2 * a8;
+ c0 -= c1 * a4;
+ c0 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+}
+
+static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3;
+ FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
+ FLOAT c0_nxt7, c1_nxt7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt1 = *(c + 0 + 1 * ldc);
+ c1_nxt1 = *(c + 1 + 1 * ldc);
+ c0_nxt2 = *(c + 0 + 2 * ldc);
+ c1_nxt2 = *(c + 1 + 2 * ldc);
+ c0_nxt3 = *(c + 0 + 3 * ldc);
+ c1_nxt3 = *(c + 1 + 3 * ldc);
+ c0_nxt4 = *(c + 0 + 4 * ldc);
+ c1_nxt4 = *(c + 1 + 4 * ldc);
+ c0_nxt5 = *(c + 0 + 5 * ldc);
+ c1_nxt5 = *(c + 1 + 5 * ldc);
+ c0_nxt6 = *(c + 0 + 6 * ldc);
+ c1_nxt6 = *(c + 1 + 6 * ldc);
+ c0_nxt7 = *(c + 0 + 7 * ldc);
+ c1_nxt7 = *(c + 1 + 7 * ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt1 -= aa[0] * bb[1];
+ c1_nxt1 -= aa[1] * bb[1];
+ c0_nxt2 -= aa[0] * bb[2];
+ c1_nxt2 -= aa[1] * bb[2];
+ c0_nxt3 -= aa[0] * bb[3];
+ c1_nxt3 -= aa[1] * bb[3];
+ c0_nxt4 -= aa[0] * bb[4];
+ c1_nxt4 -= aa[1] * bb[4];
+ c0_nxt5 -= aa[0] * bb[5];
+ c1_nxt5 -= aa[1] * bb[5];
+ c0_nxt6 -= aa[0] * bb[6];
+ c1_nxt6 -= aa[1] * bb[6];
+ c0_nxt7 -= aa[0] * bb[7];
+ c1_nxt7 -= aa[1] * bb[7];
+
+ aa += 2;
+ bb += 8;
+ }
+
+ a -= 4;
+ b -= 16;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+ c1_nxt1 *= a3;
+ c1_nxt2 *= a3;
+ c1_nxt3 *= a3;
+ c1_nxt4 *= a3;
+ c1_nxt5 *= a3;
+ c1_nxt6 *= a3;
+ c1_nxt7 *= a3;
+
+ c0 -= c1 * a2;
+ c0_nxt1 -= c1_nxt1 * a2;
+ c0_nxt2 -= c1_nxt2 * a2;
+ c0_nxt3 -= c1_nxt3 * a2;
+ c0_nxt4 -= c1_nxt4 * a2;
+ c0_nxt5 -= c1_nxt5 * a2;
+ c0_nxt6 -= c1_nxt6 * a2;
+ c0_nxt7 -= c1_nxt7 * a2;
+
+ c0 *= a0;
+ c0_nxt1 *= a0;
+ c0_nxt2 *= a0;
+ c0_nxt3 *= a0;
+ c0_nxt4 *= a0;
+ c0_nxt5 *= a0;
+ c0_nxt6 *= a0;
+ c0_nxt7 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt1;
+ *(b + 2) = c0_nxt2;
+ *(b + 3) = c0_nxt3;
+ *(b + 4) = c0_nxt4;
+ *(b + 5) = c0_nxt5;
+ *(b + 6) = c0_nxt6;
+ *(b + 7) = c0_nxt7;
+ *(b + 8) = c1;
+ *(b + 9) = c1_nxt1;
+ *(b + 10) = c1_nxt2;
+ *(b + 11) = c1_nxt3;
+ *(b + 12) = c1_nxt4;
+ *(b + 13) = c1_nxt5;
+ *(b + 14) = c1_nxt6;
+ *(b + 15) = c1_nxt7;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + 1 * ldc) = c0_nxt1;
+ *(c + 1 + 1 * ldc) = c1_nxt1;
+ *(c + 0 + 2 * ldc) = c0_nxt2;
+ *(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 0 + 3 * ldc) = c0_nxt3;
+ *(c + 1 + 3 * ldc) = c1_nxt3;
+ *(c + 0 + 4 * ldc) = c0_nxt4;
+ *(c + 1 + 4 * ldc) = c1_nxt4;
+ *(c + 0 + 5 * ldc) = c0_nxt5;
+ *(c + 1 + 5 * ldc) = c1_nxt5;
+ *(c + 0 + 6 * ldc) = c0_nxt6;
+ *(c + 1 + 6 * ldc) = c1_nxt6;
+ *(c + 0 + 7 * ldc) = c0_nxt7;
+ *(c + 1 + 7 * ldc) = c1_nxt7;
+}
+
+static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
+ FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt1 = *(c + 0 + ldc);
+ c1_nxt1 = *(c + 1 + ldc);
+ c0_nxt2 = *(c + 0 + 2 * ldc);
+ c1_nxt2 = *(c + 1 + 2 * ldc);
+ c0_nxt3 = *(c + 0 + 3 * ldc);
+ c1_nxt3 = *(c + 1 + 3 * ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt1 -= aa[0] * bb[1];
+ c1_nxt1 -= aa[1] * bb[1];
+ c0_nxt2 -= aa[0] * bb[2];
+ c1_nxt2 -= aa[1] * bb[2];
+ c0_nxt3 -= aa[0] * bb[3];
+ c1_nxt3 -= aa[1] * bb[3];
+
+ aa += 2;
+ bb += 4;
+ }
+
+ a -= 4;
+ b -= 8;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+ c1_nxt1 *= a3;
+ c1_nxt2 *= a3;
+ c1_nxt3 *= a3;
+
+ c0 -= c1 * a2;
+ c0_nxt1 -= c1_nxt1 * a2;
+ c0_nxt2 -= c1_nxt2 * a2;
+ c0_nxt3 -= c1_nxt3 * a2;
+
+ c0 *= a0;
+ c0_nxt1 *= a0;
+ c0_nxt2 *= a0;
+ c0_nxt3 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt1;
+ *(b + 2) = c0_nxt2;
+ *(b + 3) = c0_nxt3;
+ *(b + 4) = c1;
+ *(b + 5) = c1_nxt1;
+ *(b + 6) = c1_nxt2;
+ *(b + 7) = c1_nxt3;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + ldc) = c0_nxt1;
+ *(c + 1 + ldc) = c1_nxt1;
+ *(c + 0 + 2 * ldc) = c0_nxt2;
+ *(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 0 + 3 * ldc) = c0_nxt3;
+ *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
+
+ aa += 2;
+ bb += 2;
+ }
+
+ a -= 4;
+ b -= 4;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+ c1_nxt *= a3;
+
+ c0 -= c1 * a2;
+ c0_nxt -= c1_nxt * a2;
+
+ c0 *= a0;
+ c0_nxt *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+}
+
+static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, a2, a3, c0, c1;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+
+ aa += 2;
+ bb += 1;
+ }
+
+ a -= 4;
+ b -= 2;
+
+ a0 = *(a + 0);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+
+ c1 *= a3;
+
+ c0 -= c1 * a2;
+ c0 *= a0;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+}
+
+static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
+ c4 = *(c + 4 * ldc);
+ c5 = *(c + 5 * ldc);
+ c6 = *(c + 6 * ldc);
+ c7 = *(c + 7 * ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[0] * bb[1];
+ c2 -= aa[0] * bb[2];
+ c3 -= aa[0] * bb[3];
+ c4 -= aa[0] * bb[4];
+ c5 -= aa[0] * bb[5];
+ c6 -= aa[0] * bb[6];
+ c7 -= aa[0] * bb[7];
+
+ aa += 1;
+ bb += 8;
+ }
+
+ a0 = *(a - 1);
+
+ c0 *= a0;
+ c1 *= a0;
+ c2 *= a0;
+ c3 *= a0;
+ c4 *= a0;
+ c5 *= a0;
+ c6 *= a0;
+ c7 *= a0;
+
+ *(b - 8) = c0;
+ *(b - 7) = c1;
+ *(b - 6) = c2;
+ *(b - 5) = c3;
+ *(b - 4) = c4;
+ *(b - 3) = c5;
+ *(b - 2) = c6;
+ *(b - 1) = c7;
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+ *(c + 4 * ldc) = c4;
+ *(c + 5 * ldc) = c5;
+ *(c + 6 * ldc) = c6;
+ *(c + 7 * ldc) = c7;
+}
+
+static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, c0, c1, c2, c3;
+
+ c0 = *(c + 0 * ldc);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[0] * bb[1];
+ c2 -= aa[0] * bb[2];
+ c3 -= aa[0] * bb[3];
+
+ aa += 1;
+ bb += 4;
+ }
+
+ a0 = *(a - 1);
+
+ c0 *= a0;
+ c1 *= a0;
+ c2 *= a0;
+ c3 *= a0;
+
+ *(b - 4) = c0;
+ *(b - 3) = c1;
+ *(b - 2) = c2;
+ *(b - 1) = c3;
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+}
+
+static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ FLOAT a0, c0, c1;
+
+ c0 = *c;
+ c1 = *(c + ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[0] * bb[1];
+
+ aa += 1;
+ bb += 2;
+ }
+
+ a0 = *(a - 1);
+
+ c0 *= a0;
+ c1 *= a0;
+
+ *(b - 2) = c0;
+ *(b - 1) = c1;
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+}
+
+static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ BLASLONG k;
+
+ for (k = 0; k < bk; k++)
+ {
+ *c -= a[k] * b[k];
+ }
+
+ *c *= *(a - 1);
+ *(b - 1) = *c;
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+ FLOAT *aa, *cc;
+ BLASLONG i, j, kk;
+
+ for (j = (n >> 3); j--;)
+ {
+ kk = m + offset;
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + (m - 1) * k + kk;
+ cc = c + (m - 1);
+
+ ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & ~1) - 2) * k + 2 * kk;
+ cc = c + ((m & ~1) - 2);
+
+ ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & ~3) - 4) * k + 4 * kk;
+ cc = c + ((m & ~3) - 4);
+
+ ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk));
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & ~7) - 8) * k;
+ cc = c + ((m & ~7) - 8);
+
+ do
+ {
+ ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += 8 * k;
+ c += 8 * ldc;
+ }
+
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + (m - 1) * k + kk;
+ cc = c + (m - 1);
+
+ ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & ~1) - 2) * k + 2 * kk;
+ cc = c + ((m & ~1) - 2);
+
+ ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & ~3) - 4) * k + 4 * kk;
+ cc = c + ((m & ~3) - 4);
+
+ ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk));
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & ~7) - 8) * k;
+ cc = c + ((m & ~7) - 8);
+
+ do
+ {
+ ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += 4 * k;
+ c += 4 * ldc;
+ }
+
+ if (n & 2)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + (m - 1) * k + kk;
+ cc = c + (m - 1);
+
+ ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & ~1) - 2) * k + 2 * kk;
+ cc = c + ((m & ~1) - 2);
+
+ ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & ~3) - 4) * k + 4 * kk;
+ cc = c + ((m & ~3) - 4);
+
+ ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk));
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & ~7) - 8) * k;
+ cc = c + ((m & ~7) - 8);
+
+ do
+ {
+ ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += 2 * k;
+ c += 2 * ldc;
+ }
+
+ if (n & 1)
+ {
+ kk = m + offset;
+
+ if (m & 7)
+ {
+ if (m & 1)
+ {
+ aa = a + (m - 1) * k + kk;
+ cc = c + (m - 1);
+
+ ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk));
+
+ kk -= 1;
+ }
+
+ if (m & 2)
+ {
+ aa = a + ((m & ~1) - 2) * k + 2 * kk;
+ cc = c + ((m & ~1) - 2);
+
+ ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
+
+ kk -= 2;
+ }
+
+ if (m & 4)
+ {
+ aa = a + ((m & ~3) - 4) * k + 4 * kk;
+ cc = c + ((m & ~3) - 4);
+
+ ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk));
+
+ kk -= 4;
+ }
+ }
+
+ i = (m >> 3);
+ if (i > 0)
+ {
+ aa = a + ((m & ~7) - 8) * k;
+ cc = c + ((m & ~7) - 8);
+
+ do
+ {
+ ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));
+
+ aa -= 8 * k;
+ cc -= 8;
+ kk -= 8;
+ i --;
+ } while (i > 0);
+ }
+
+ b += k;
+ c += ldc;
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c
new file mode 100644
index 000000000..5834d77b2
--- /dev/null
+++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c
@@ -0,0 +1,1694 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
+ v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+ v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+ v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+ v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+ v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+ v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+ FLOAT *c_nxt4line = c + 4 * ldc;
+ FLOAT *c_nxt5line = c + 5 * ldc;
+ FLOAT *c_nxt6line = c + 6 * ldc;
+ FLOAT *c_nxt7line = c + 7 * ldc;
+
+ LD_SP2(c, 4, src_c0, src_c1);
+ LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+ LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+ LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+ LD_SP2(c_nxt4line, 4, src_c8, src_c9);
+ LD_SP2(c_nxt5line, 4, src_c10, src_c11);
+ LD_SP2(c_nxt6line, 4, src_c12, src_c13);
+ LD_SP2(c_nxt7line, 4, src_c14, src_c15);
+
+ for (k = 0; k < bk; k++)
+ {
+ LD_SP2(a, 4, src_a0, src_a1);
+
+ src_b = LD_SP(b + 0);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ src_b = LD_SP(b + 4);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+
+ a += 8;
+ b += 8;
+ }
+
+ TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
+ res_c0, res_c1, res_c2, res_c3);
+ TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
+ res_c8, res_c9, res_c10, res_c11);
+ TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
+ res_c4, res_c5, res_c6, res_c7);
+ TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
+ res_c12, res_c13, res_c14, res_c15);
+
+ src_a = LD_SP(a + 0);
+ SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
+ src_a = LD_SP(a + 4);
+ SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);
+
+ res_c0 *= src_a0;
+ res_c8 *= src_a0;
+ res_c1 -= res_c0 * src_a1;
+ res_c9 -= res_c8 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c10 -= res_c8 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+ res_c11 -= res_c8 * src_a3;
+ res_c4 -= res_c0 * src_a4;
+ res_c12 -= res_c8 * src_a4;
+ res_c5 -= res_c0 * src_a5;
+ res_c13 -= res_c8 * src_a5;
+ res_c6 -= res_c0 * src_a6;
+ res_c14 -= res_c8 * src_a6;
+ res_c7 -= res_c0 * src_a7;
+ res_c15 -= res_c8 * src_a7;
+
+ src_a = LD_SP(a + 9);
+ SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
+ src_a13 = LD_SP(a + 13);
+ src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
+ src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
+ src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);
+
+ res_c1 *= src_a9;
+ res_c9 *= src_a9;
+ res_c2 -= res_c1 * src_a10;
+ res_c10 -= res_c9 * src_a10;
+ res_c3 -= res_c1 * src_a11;
+ res_c11 -= res_c9 * src_a11;
+ res_c4 -= res_c1 * src_a12;
+ res_c12 -= res_c9 * src_a12;
+ res_c5 -= res_c1 * src_a13;
+ res_c13 -= res_c9 * src_a13;
+ res_c6 -= res_c1 * src_a14;
+ res_c14 -= res_c9 * src_a14;
+ res_c7 -= res_c1 * src_a15;
+ res_c15 -= res_c9 * src_a15;
+
+ src_a = LD_SP(a + 18);
+ SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
+ src_a22 = LD_SP(a + 22);
+ src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
+ src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);
+
+ res_c2 *= src_a18;
+ res_c10 *= src_a18;
+ res_c3 -= res_c2 * src_a19;
+ res_c11 -= res_c10 * src_a19;
+ res_c4 -= res_c2 * src_a20;
+ res_c12 -= res_c10 * src_a20;
+ res_c5 -= res_c2 * src_a21;
+ res_c13 -= res_c10 * src_a21;
+ res_c6 -= res_c2 * src_a22;
+ res_c14 -= res_c10 * src_a22;
+ res_c7 -= res_c2 * src_a23;
+ res_c15 -= res_c10 * src_a23;
+
+ src_a = LD_SP(a + 27);
+ SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
+ src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
+
+ res_c3 *= src_a27;
+ res_c11 *= src_a27;
+ res_c4 -= res_c3 * src_a28;
+ res_c12 -= res_c11 * src_a28;
+ res_c5 -= res_c3 * src_a29;
+ res_c13 -= res_c11 * src_a29;
+ res_c6 -= res_c3 * src_a30;
+ res_c14 -= res_c11 * src_a30;
+ res_c7 -= res_c3 * src_a31;
+ res_c15 -= res_c11 * src_a31;
+
+ ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
+ ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);
+
+ TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
+ src_c0, src_c2, src_c4, src_c6);
+ TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
+ src_c8, src_c10, src_c12, src_c14);
+
+ ST_SP(src_c0, c);
+ ST_SP(src_c2, c_nxt1line);
+ ST_SP(src_c4, c_nxt2line);
+ ST_SP(src_c6, c_nxt3line);
+ ST_SP(src_c8, c_nxt4line);
+ ST_SP(src_c10, c_nxt5line);
+ ST_SP(src_c12, c_nxt6line);
+ ST_SP(src_c14, c_nxt7line);
+
+ src_a = LD_SP(a + 36);
+ SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);
+
+ res_c4 *= src_a36;
+ res_c12 *= src_a36;
+ res_c5 -= res_c4 * src_a37;
+ res_c13 -= res_c12 * src_a37;
+ res_c6 -= res_c4 * src_a38;
+ res_c14 -= res_c12 * src_a38;
+ res_c7 -= res_c4 * src_a39;
+ res_c15 -= res_c12 * src_a39;
+
+ src_a45 = LD_SP(a + 45);
+ src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
+ src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
+ src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);
+
+ res_c5 *= src_a45;
+ res_c13 *= src_a45;
+ res_c6 -= res_c5 * src_a46;
+ res_c14 -= res_c13 * src_a46;
+ res_c7 -= res_c5 * src_a47;
+ res_c15 -= res_c13 * src_a47;
+
+ src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+ src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+ src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
+
+ res_c6 *= src_a54;
+ res_c14 *= src_a54;
+ res_c7 -= res_c6 * src_a55;
+ res_c15 -= res_c14 * src_a55;
+
+ res_c7 *= src_a63;
+ res_c15 *= src_a63;
+
+ ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
+ ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);
+
+ TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
+ src_c1, src_c3, src_c5, src_c7);
+ TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
+ src_c9, src_c11, src_c13, src_c15);
+
+ ST_SP(src_c1, c + 4);
+ ST_SP(src_c3, c_nxt1line + 4);
+ ST_SP(src_c5, c_nxt2line + 4);
+ ST_SP(src_c7, c_nxt3line + 4);
+ ST_SP(src_c9, c_nxt4line + 4);
+ ST_SP(src_c11, c_nxt5line + 4);
+ ST_SP(src_c13, c_nxt6line + 4);
+ ST_SP(src_c15, c_nxt7line + 4);
+}
+
+static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
+ v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+ v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+ v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+ v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+
+ LD_SP2(c, 4, src_c0, src_c1);
+ LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+ LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+ LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+
+ for (k = 0; k < bk; k++)
+ {
+ LD_SP2(a, 4, src_a0, src_a1);
+
+ src_b = LD_SP(b + 0);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ a += 8;
+ b += 4;
+ }
+
+ TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
+ res_c0, res_c1, res_c2, res_c3);
+ TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
+ res_c4, res_c5, res_c6, res_c7);
+
+ src_a = LD_SP(a + 0);
+ SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
+ src_a = LD_SP(a + 4);
+ SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);
+
+ res_c0 *= src_a0;
+ res_c1 -= res_c0 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+ res_c4 -= res_c0 * src_a4;
+ res_c5 -= res_c0 * src_a5;
+ res_c6 -= res_c0 * src_a6;
+ res_c7 -= res_c0 * src_a7;
+
+ src_a = LD_SP(a + 9);
+ SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
+ src_a13 = LD_SP(a + 13);
+ src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
+ src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
+ src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);
+
+ res_c1 *= src_a9;
+ res_c2 -= res_c1 * src_a10;
+ res_c3 -= res_c1 * src_a11;
+ res_c4 -= res_c1 * src_a12;
+ res_c5 -= res_c1 * src_a13;
+ res_c6 -= res_c1 * src_a14;
+ res_c7 -= res_c1 * src_a15;
+
+ src_a = LD_SP(a + 18);
+ SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
+ src_a22 = LD_SP(a + 22);
+ src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
+ src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);
+
+ res_c2 *= src_a18;
+ res_c3 -= res_c2 * src_a19;
+ res_c4 -= res_c2 * src_a20;
+ res_c5 -= res_c2 * src_a21;
+ res_c6 -= res_c2 * src_a22;
+ res_c7 -= res_c2 * src_a23;
+
+ src_a = LD_SP(a + 27);
+ SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
+ src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
+
+ res_c3 *= src_a27;
+ res_c4 -= res_c3 * src_a28;
+ res_c5 -= res_c3 * src_a29;
+ res_c6 -= res_c3 * src_a30;
+ res_c7 -= res_c3 * src_a31;
+
+ src_a = LD_SP(a + 36);
+ SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);
+
+ res_c4 *= src_a36;
+ res_c5 -= res_c4 * src_a37;
+ res_c6 -= res_c4 * src_a38;
+ res_c7 -= res_c4 * src_a39;
+
+ src_a45 = LD_SP(a + 45);
+ src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
+ src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
+ src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);
+
+ res_c5 *= src_a45;
+ res_c6 -= res_c5 * src_a46;
+ res_c7 -= res_c5 * src_a47;
+
+ src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+ src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+ src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
+
+ res_c6 *= src_a54;
+ res_c7 -= res_c6 * src_a55;
+ res_c7 *= src_a63;
+
+ ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
+ b += 16;
+ ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4);
+
+ TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
+ src_c0, src_c2, src_c4, src_c6);
+ TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
+ src_c1, src_c3, src_c5, src_c7);
+
+ ST_SP2(src_c0, src_c1, c, 4);
+ ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+ ST_SP2(src_c4, src_c5, c_nxt2line, 4);
+ ST_SP2(src_c6, src_c7, c_nxt3line, 4);
+}
+
+static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
+ FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
+ FLOAT a45, a46, a47, a54, a55, a63;
+ FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+ FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+ c0_nxt = *(c + 0 + ldc);
+ c1_nxt = *(c + 1 + ldc);
+ c2_nxt = *(c + 2 + ldc);
+ c3_nxt = *(c + 3 + ldc);
+ c4_nxt = *(c + 4 + ldc);
+ c5_nxt = *(c + 5 + ldc);
+ c6_nxt = *(c + 6 + ldc);
+ c7_nxt = *(c + 7 + ldc);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+ c4 -= a[4] * b[0];
+ c5 -= a[5] * b[0];
+ c6 -= a[6] * b[0];
+ c7 -= a[7] * b[0];
+ c0_nxt -= a[0] * b[1];
+ c1_nxt -= a[1] * b[1];
+ c2_nxt -= a[2] * b[1];
+ c3_nxt -= a[3] * b[1];
+ c4_nxt -= a[4] * b[1];
+ c5_nxt -= a[5] * b[1];
+ c6_nxt -= a[6] * b[1];
+ c7_nxt -= a[7] * b[1];
+
+ a += 8;
+ b += 2;
+ }
+
+ a0 = *(a + 0);
+ a1 = *(a + 1);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a6 = *(a + 6);
+ a7 = *(a + 7);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a11 = *(a + 11);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+ a18 = *(a + 18);
+ a19 = *(a + 19);
+ a20 = *(a + 20);
+ a21 = *(a + 21);
+ a22 = *(a + 22);
+ a23 = *(a + 23);
+ a27 = *(a + 27);
+ a28 = *(a + 28);
+ a29 = *(a + 29);
+ a30 = *(a + 30);
+ a31 = *(a + 31);
+ a36 = *(a + 36);
+ a37 = *(a + 37);
+ a38 = *(a + 38);
+ a39 = *(a + 39);
+ a45 = *(a + 45);
+ a46 = *(a + 46);
+ a47 = *(a + 47);
+ a54 = *(a + 54);
+ a55 = *(a + 55);
+ a63 = *(a + 63);
+
+ c0 *= a0;
+ c0_nxt *= a0;
+
+ c1 -= c0 * a1;
+ c1_nxt -= c0_nxt * a1;
+ c1 *= a9;
+ c1_nxt *= a9;
+
+ c2 -= c0 * a2;
+ c2_nxt -= c0_nxt * a2;
+ c2 -= c1 * a10;
+ c2_nxt -= c1_nxt * a10;
+ c2 *= a18;
+ c2_nxt *= a18;
+
+ c3 -= c0 * a3;
+ c3_nxt -= c0_nxt * a3;
+ c3 -= c1 * a11;
+ c3_nxt -= c1_nxt * a11;
+ c3 -= c2 * a19;
+ c3_nxt -= c2_nxt * a19;
+ c3 *= a27;
+ c3_nxt *= a27;
+
+ c4 -= c0 * a4;
+ c4_nxt -= c0_nxt * a4;
+ c4 -= c1 * a12;
+ c4_nxt -= c1_nxt * a12;
+ c4 -= c2 * a20;
+ c4_nxt -= c2_nxt * a20;
+ c4 -= c3 * a28;
+ c4_nxt -= c3_nxt * a28;
+ c4 *= a36;
+ c4_nxt *= a36;
+
+ c5 -= c0 * a5;
+ c5_nxt -= c0_nxt * a5;
+ c5 -= c1 * a13;
+ c5_nxt -= c1_nxt * a13;
+ c5 -= c2 * a21;
+ c5_nxt -= c2_nxt * a21;
+ c5 -= c3 * a29;
+ c5_nxt -= c3_nxt * a29;
+ c5 -= c4 * a37;
+ c5_nxt -= c4_nxt * a37;
+ c5 *= a45;
+ c5_nxt *= a45;
+
+ c6 -= c0 * a6;
+ c6_nxt -= c0_nxt * a6;
+ c6 -= c1 * a14;
+ c6_nxt -= c1_nxt * a14;
+ c6 -= c2 * a22;
+ c6_nxt -= c2_nxt * a22;
+ c6 -= c3 * a30;
+ c6_nxt -= c3_nxt * a30;
+ c6 -= c4 * a38;
+ c6_nxt -= c4_nxt * a38;
+ c6 -= c5 * a46;
+ c6_nxt -= c5_nxt * a46;
+ c6 *= a54;
+ c6_nxt *= a54;
+
+ c7 -= c0 * a7;
+ c7_nxt -= c0_nxt * a7;
+ c7 -= c1 * a15;
+ c7_nxt -= c1_nxt * a15;
+ c7 -= c2 * a23;
+ c7_nxt -= c2_nxt * a23;
+ c7 -= c3 * a31;
+ c7_nxt -= c3_nxt * a31;
+ c7 -= c4 * a39;
+ c7_nxt -= c4_nxt * a39;
+ c7 -= c5 * a47;
+ c7_nxt -= c5_nxt * a47;
+ c7 -= c6 * a55;
+ c7_nxt -= c6_nxt * a55;
+ c7 *= a63;
+ c7_nxt *= a63;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 4) = c4;
+ *(c + 5) = c5;
+ *(c + 6) = c6;
+ *(c + 7) = c7;
+ *(c + 0 + ldc) = c0_nxt;
+ *(c + 1 + ldc) = c1_nxt;
+ *(c + 2 + ldc) = c2_nxt;
+ *(c + 3 + ldc) = c3_nxt;
+ *(c + 4 + ldc) = c4_nxt;
+ *(c + 5 + ldc) = c5_nxt;
+ *(c + 6 + ldc) = c6_nxt;
+ *(c + 7 + ldc) = c7_nxt;
+
+ *(b + 0) = c0;
+ *(b + 1) = c0_nxt;
+ *(b + 2) = c1;
+ *(b + 3) = c1_nxt;
+ *(b + 4) = c2;
+ *(b + 5) = c2_nxt;
+ *(b + 6) = c3;
+ *(b + 7) = c3_nxt;
+ *(b + 8) = c4;
+ *(b + 9) = c4_nxt;
+ *(b + 10) = c5;
+ *(b + 11) = c5_nxt;
+ *(b + 12) = c6;
+ *(b + 13) = c6_nxt;
+ *(b + 14) = c7;
+ *(b + 15) = c7_nxt;
+}
+
+static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ BLASLONG k;
+ FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
+ FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
+ FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
+
+ c0 = *(c + 0);
+ c1 = *(c + 1);
+ c2 = *(c + 2);
+ c3 = *(c + 3);
+ c4 = *(c + 4);
+ c5 = *(c + 5);
+ c6 = *(c + 6);
+ c7 = *(c + 7);
+
+ for (k = 0; k < bk; k++)
+ {
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+ c4 -= a[4] * b[0];
+ c5 -= a[5] * b[0];
+ c6 -= a[6] * b[0];
+ c7 -= a[7] * b[0];
+
+ a += 8;
+ b += 1;
+ }
+
+ a0 = *(a + 0);
+ a1 = *(a + 1);
+ a2 = *(a + 2);
+ a3 = *(a + 3);
+ a4 = *(a + 4);
+ a5 = *(a + 5);
+ a6 = *(a + 6);
+ a7 = *(a + 7);
+ a9 = *(a + 9);
+ a10 = *(a + 10);
+ a11 = *(a + 11);
+ a12 = *(a + 12);
+ a13 = *(a + 13);
+ a14 = *(a + 14);
+ a15 = *(a + 15);
+ a18 = *(a + 18);
+ a19 = *(a + 19);
+ a20 = *(a + 20);
+ a21 = *(a + 21);
+ a22 = *(a + 22);
+ a23 = *(a + 23);
+ a27 = *(a + 27);
+ a28 = *(a + 28);
+ a29 = *(a + 29);
+ a30 = *(a + 30);
+ a31 = *(a + 31);
+ a36 = *(a + 36);
+ a37 = *(a + 37);
+ a38 = *(a + 38);
+ a39 = *(a + 39);
+ a45 = *(a + 45);
+ a46 = *(a + 46);
+ a47 = *(a + 47);
+ a54 = *(a + 54);
+ a55 = *(a + 55);
+ a63 = *(a + 63);
+
+ c0 *= a0;
+
+ c1 -= c0 * a1;
+ c1 *= a9;
+
+ c2 -= c0 * a2;
+ c2 -= c1 * a10;
+ c2 *= a18;
+
+ c3 -= c0 * a3;
+ c3 -= c1 * a11;
+ c3 -= c2 * a19;
+ c3 *= a27;
+
+ c4 -= c0 * a4;
+ c4 -= c1 * a12;
+ c4 -= c2 * a20;
+ c4 -= c3 * a28;
+ c4 *= a36;
+
+ c5 -= c0 * a5;
+ c5 -= c1 * a13;
+ c5 -= c2 * a21;
+ c5 -= c3 * a29;
+ c5 -= c4 * a37;
+ c5 *= a45;
+
+ c6 -= c0 * a6;
+ c6 -= c1 * a14;
+ c6 -= c2 * a22;
+ c6 -= c3 * a30;
+ c6 -= c4 * a38;
+ c6 -= c5 * a46;
+ c6 *= a54;
+
+ c7 -= c0 * a7;
+ c7 -= c1 * a15;
+ c7 -= c2 * a23;
+ c7 -= c3 * a31;
+ c7 -= c4 * a39;
+ c7 -= c5 * a47;
+ c7 -= c6 * a55;
+ c7 *= a63;
+
+ *(c + 0) = c0;
+ *(c + 1) = c1;
+ *(c + 2) = c2;
+ *(c + 3) = c3;
+ *(c + 4) = c4;
+ *(c + 5) = c5;
+ *(c + 6) = c6;
+ *(c + 7) = c7;
+
+ *(b + 0) = c0;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
+ *(b + 4) = c4;
+ *(b + 5) = c5;
+ *(b + 6) = c6;
+ *(b + 7) = c7;
+}
+
+static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+ BLASLONG k;
+ v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
+ v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+ v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+ v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+ v4f32 src_a10, src_a11, src_a15, src_a;
+ FLOAT *c_nxt1line = c + ldc;
+ FLOAT *c_nxt2line = c + 2 * ldc;
+ FLOAT *c_nxt3line = c + 3 * ldc;
+ FLOAT *c_nxt4line = c + 4 * ldc;
+ FLOAT *c_nxt5line = c + 5 * ldc;
+ FLOAT *c_nxt6line = c + 6 * ldc;
+ FLOAT *c_nxt7line = c + 7 * ldc;
+
+ src_c0 = LD_SP(c);
+ src_c1 = LD_SP(c_nxt1line);
+ src_c2 = LD_SP(c_nxt2line);
+ src_c3 = LD_SP(c_nxt3line);
+ src_c4 = LD_SP(c_nxt4line);
+ src_c5 = LD_SP(c_nxt5line);
+ src_c6 = LD_SP(c_nxt6line);
+ src_c7 = LD_SP(c_nxt7line);
+
+ for (k = 0; k < (bk >> 1); k++)
+ {
+ src_a0 = LD_SP(a);
+
+ src_b = LD_SP(b + 0);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a0 * src_b1;
+ src_c2 -= src_a0 * src_b2;
+ src_c3 -= src_a0 * src_b3;
+
+ src_b = LD_SP(b + 4);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c4 -= src_a0 * src_b0;
+ src_c5 -= src_a0 * src_b1;
+ src_c6 -= src_a0 * src_b2;
+ src_c7 -= src_a0 * src_b3;
+
+ a += 4;
+ b += 8;
+
+ src_a0 = LD_SP(a);
+
+ src_b = LD_SP(b + 0);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a0 * src_b1;
+ src_c2 -= src_a0 * src_b2;
+ src_c3 -= src_a0 * src_b3;
+
+ src_b = LD_SP(b + 4);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c4 -= src_a0 * src_b0;
+ src_c5 -= src_a0 * src_b1;
+ src_c6 -= src_a0 * src_b2;
+ src_c7 -= src_a0 * src_b3;
+
+ a += 4;
+ b += 8;
+ }
+
+ if ((bk & 1) && (bk > 0))
+ {
+ src_a0 = LD_SP(a);
+
+ src_b = LD_SP(b + 0);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a0 * src_b1;
+ src_c2 -= src_a0 * src_b2;
+ src_c3 -= src_a0 * src_b3;
+
+ src_b = LD_SP(b + 4);
+ SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+ src_c4 -= src_a0 * src_b0;
+ src_c5 -= src_a0 * src_b1;
+ src_c6 -= src_a0 * src_b2;
+ src_c7 -= src_a0 * src_b3;
+
+ a += 4;
+ b += 8;
+ }
+
+ TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
+ res_c0, res_c1, res_c2, res_c3);
+ TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
+ res_c4, res_c5, res_c6, res_c7);
+
+ src_a = LD_SP(a + 0);
+ SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
+ src_a5 = LD_SP(a + 5);
+ src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
+ src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
+ src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
+ src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+ src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+ src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
+
+ res_c0 *= src_a0;
+ res_c4 *= src_a0;
+ res_c1 -= res_c0 * src_a1;
+ res_c5 -= res_c4 * src_a1;
+ res_c2 -= res_c0 * src_a2;
+ res_c6 -= res_c4 * src_a2;
+ res_c3 -= res_c0 * src_a3;
+ res_c7 -= res_c4 * src_a3;
+
+ res_c1 *= src_a5;
+ res_c5 *= src_a5;
+ res_c2 -= res_c1 * src_a6;
+ res_c6 -= res_c5 * src_a6;
+ res_c3 -= res_c1 * src_a7;
+ res_c7 -= res_c5 * src_a7;
+
+ res_c2 *= src_a10;
+ res_c6 *= src_a10;
+ res_c3 -= res_c2 * src_a11;
+ res_c7 -= res_c6 * src_a11;
+
+ res_c3 *= src_a15;
+ res_c7 *= src_a15;
+
+ ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
+ ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);
+
+ TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
+ src_c0, src_c1, src_c2, src_c3);
+ TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
+ src_c4, src_c5, src_c6, src_c7);
+
+ ST_SP(src_c0, c);
+ ST_SP(src_c1, c_nxt1line);
+ ST_SP(src_c2, c_nxt2line);
+ ST_SP(src_c3, c_nxt3line);
+ ST_SP(src_c4, c_nxt4line);
+ ST_SP(src_c5, c_nxt5line);
+ ST_SP(src_c6, c_nxt6line);
+ ST_SP(src_c7, c_nxt7line);
+}
+
+/* MSA 4x4 tile solve for the strsm LT (left, transposed) kernel.
+   a  : packed A panel (4 x bk) followed by the packed 4x4 triangular
+        block for this diagonal tile
+   b  : packed B panel (bk x 4); the solved tile is also written back
+        here for reuse by later GEMM updates
+   c  : 4x4 tile of C, column stride ldc; receives the solution
+   bk : number of rank-1 updates to subtract before solving
+   Diagonal entries of the triangular block are multiplied rather than
+   divided by, so they are presumably stored pre-inverted by the trsm
+   packing routine -- TODO confirm against the packing code. */
+static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
+    v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
+    v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+    v4f32 src_a10, src_a11, src_a15, src_a;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+    src_c2 = LD_SP(c_nxt2line);
+    src_c3 = LD_SP(c_nxt3line);
+
+    /* C -= A * B over the already-processed part, unrolled by two. */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+    }
+
+    /* Odd tail iteration; (bk > 0) is redundant once (bk & 1) holds. */
+    if ((bk & 1) && (bk > 0))
+    {
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+    }
+
+    /* Transpose so each res_c vector holds one row of the 4x4 tile. */
+    TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
+                       res_c0, res_c1, res_c2, res_c3);
+
+    /* Broadcast the used entries of the packed triangular block
+       (row-major upper triangle: offsets 0..3, 5..7, 10..11, 15). */
+    src_a = LD_SP(a + 0);
+    SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
+    src_a5 = LD_SP(a + 5);
+    src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
+    src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
+    src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
+    src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+    src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+    src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
+
+    /* Forward substitution, one row at a time. */
+    res_c0 *= src_a0;
+    res_c1 -= res_c0 * src_a1;
+    res_c2 -= res_c0 * src_a2;
+    res_c3 -= res_c0 * src_a3;
+
+    res_c1 *= src_a5;
+    res_c2 -= res_c1 * src_a6;
+    res_c3 -= res_c1 * src_a7;
+
+    res_c2 *= src_a10;
+    res_c3 -= res_c2 * src_a11;
+
+    res_c3 *= src_a15;
+
+    /* Solved rows back into the packed B panel ... */
+    ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
+
+    /* ... and transposed back into column order for the C tile. */
+    TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
+                       src_c0, src_c1, src_c2, src_c3);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+    ST_SP(src_c2, c_nxt2line);
+    ST_SP(src_c3, c_nxt3line);
+}
+
+/* Scalar 4x2 tile solve for the strsm LT kernel: subtract bk rank-1
+   updates (A is packed 4 x bk, B packed bk x 2), then forward-
+   substitute both columns through the packed 4x4 triangular block.
+   Diagonal entries are multiplied, not divided by (presumably stored
+   pre-inverted by the packing -- TODO confirm).  Results go to both
+   the packed B panel and the C tile (column stride ldc). */
+static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT x[4], y[4];
+
+    for (i = 0; i < 4; i++)
+    {
+        x[i] = c[i];
+        y[i] = c[i + ldc];
+    }
+
+    /* Accumulation: C -= A * B. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            x[j] -= a[4 * i + j] * b[2 * i];
+            y[j] -= a[4 * i + j] * b[2 * i + 1];
+        }
+    }
+
+    a += 4 * bk;
+    b += 2 * bk;
+
+    /* Forward substitution: row i of the triangular block lives at
+       a[4*i + j], diagonal at a[5*i]. */
+    for (i = 0; i < 4; i++)
+    {
+        x[i] *= a[5 * i];
+        y[i] *= a[5 * i];
+
+        for (j = i + 1; j < 4; j++)
+        {
+            x[j] -= x[i] * a[4 * i + j];
+            y[j] -= y[i] * a[4 * i + j];
+        }
+    }
+
+    /* Write back: interleaved into the packed B panel, column-wise
+       into the C tile. */
+    for (i = 0; i < 4; i++)
+    {
+        b[2 * i]     = x[i];
+        b[2 * i + 1] = y[i];
+
+        c[i]       = x[i];
+        c[i + ldc] = y[i];
+    }
+}
+
+/* Scalar 4x1 tile solve for the strsm LT kernel: one column of B/C
+   against the packed 4x4 triangular block (diagonal pre-inverted, so
+   it is multiplied rather than divided by).  The solved column is
+   written to both the packed B panel and C. */
+static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT x[4];
+
+    for (i = 0; i < 4; i++)
+    {
+        x[i] = c[i];
+    }
+
+    /* Accumulation: c -= A * b. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            x[j] -= a[4 * i + j] * b[i];
+        }
+    }
+
+    a += 4 * bk;
+    b += bk;
+
+    /* Forward substitution through rows a[4*i + j], diagonal a[5*i]. */
+    for (i = 0; i < 4; i++)
+    {
+        x[i] *= a[5 * i];
+
+        for (j = i + 1; j < 4; j++)
+        {
+            x[j] -= x[i] * a[4 * i + j];
+        }
+    }
+
+    for (i = 0; i < 4; i++)
+    {
+        b[i] = x[i];
+        c[i] = x[i];
+    }
+}
+
+/* Scalar 2x8 tile solve for the strsm LT kernel: eight C columns of
+   two rows each, solved against the packed 2x2 triangular block
+   (diagonal at a[0], a[3] -- multiplied, so presumably pre-inverted;
+   off-diagonal at a[1]).  Solutions are written row-major into the
+   packed B panel and back into C (column stride ldc). */
+static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT r0[8], r1[8];
+
+    for (j = 0; j < 8; j++)
+    {
+        r0[j] = c[j * ldc];
+        r1[j] = c[1 + j * ldc];
+    }
+
+    /* Accumulation: C -= A * B. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+            r0[j] -= a[2 * i] * b[8 * i + j];
+            r1[j] -= a[2 * i + 1] * b[8 * i + j];
+        }
+    }
+
+    a += 2 * bk;
+    b += 8 * bk;
+
+    /* Solve both rows for each column and write back. */
+    for (j = 0; j < 8; j++)
+    {
+        r0[j] = r0[j] * a[0];
+        r1[j] = (r1[j] - r0[j] * a[1]) * a[3];
+
+        b[j]     = r0[j];
+        b[8 + j] = r1[j];
+
+        c[j * ldc]     = r0[j];
+        c[1 + j * ldc] = r1[j];
+    }
+}
+
+/* Scalar 2x4 tile solve for the strsm LT kernel: four C columns of
+   two rows each, solved against the packed 2x2 triangular block
+   (diagonal a[0], a[3] presumably pre-inverted; off-diagonal a[1]).
+   Solutions go row-major into the packed B panel and back into C. */
+static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT r0[4], r1[4];
+
+    for (j = 0; j < 4; j++)
+    {
+        r0[j] = c[j * ldc];
+        r1[j] = c[1 + j * ldc];
+    }
+
+    /* Accumulation: C -= A * B. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            r0[j] -= a[2 * i] * b[4 * i + j];
+            r1[j] -= a[2 * i + 1] * b[4 * i + j];
+        }
+    }
+
+    a += 2 * bk;
+    b += 4 * bk;
+
+    for (j = 0; j < 4; j++)
+    {
+        r0[j] = r0[j] * a[0];
+        r1[j] = (r1[j] - r0[j] * a[1]) * a[3];
+
+        b[j]     = r0[j];
+        b[4 + j] = r1[j];
+
+        c[j * ldc]     = r0[j];
+        c[1 + j * ldc] = r1[j];
+    }
+}
+
+/* Scalar 2x2 tile solve for the strsm LT kernel: two C columns of two
+   rows, solved against the packed 2x2 triangular block (diagonal
+   a[0], a[3] presumably pre-inverted; off-diagonal a[1]). */
+static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT r0, r1, s0, s1;
+
+    r0 = c[0];
+    r1 = c[1];
+    s0 = c[ldc];
+    s1 = c[1 + ldc];
+
+    /* Accumulation: C -= A * B. */
+    for (i = 0; i < bk; i++)
+    {
+        r0 -= a[2 * i] * b[2 * i];
+        r1 -= a[2 * i + 1] * b[2 * i];
+        s0 -= a[2 * i] * b[2 * i + 1];
+        s1 -= a[2 * i + 1] * b[2 * i + 1];
+    }
+
+    a += 2 * bk;
+    b += 2 * bk;
+
+    r0 *= a[0];
+    s0 *= a[0];
+    r1 = (r1 - r0 * a[1]) * a[3];
+    s1 = (s1 - s0 * a[1]) * a[3];
+
+    /* Packed B gets the tile row-major; C gets it column-wise. */
+    b[0] = r0;
+    b[1] = s0;
+    b[2] = r1;
+    b[3] = s1;
+
+    c[0] = r0;
+    c[1] = r1;
+    c[ldc] = s0;
+    c[1 + ldc] = s1;
+}
+
+/* Scalar 2x1 tile solve for the strsm LT kernel: a single column of
+   two rows against the packed 2x2 triangular block (diagonal a[0],
+   a[3] presumably pre-inverted). */
+static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT r0 = c[0];
+    FLOAT r1 = c[1];
+
+    /* Accumulation: c -= A * b. */
+    for (i = 0; i < bk; i++)
+    {
+        r0 -= a[2 * i] * b[i];
+        r1 -= a[2 * i + 1] * b[i];
+    }
+
+    a += 2 * bk;
+    b += bk;
+
+    r0 *= a[0];
+    r1 = (r1 - r0 * a[1]) * a[3];
+
+    b[0] = r0;
+    b[1] = r1;
+
+    c[0] = r0;
+    c[1] = r1;
+}
+
+/* Scalar 1x8 tile solve for the strsm LT kernel: one row across eight
+   C columns.  The triangular block degenerates to the single scalar
+   a[bk], which is multiplied (presumably stored pre-inverted). */
+static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT r[8];
+
+    for (j = 0; j < 8; j++)
+    {
+        r[j] = c[j * ldc];
+    }
+
+    /* Accumulation: c -= a * B. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+            r[j] -= a[i] * b[8 * i + j];
+        }
+    }
+
+    b += 8 * bk;
+
+    for (j = 0; j < 8; j++)
+    {
+        r[j] *= a[bk];
+
+        b[j] = r[j];
+        c[j * ldc] = r[j];
+    }
+}
+
+/* Scalar 1x4 tile solve for the strsm LT kernel: one row across four
+   C columns; the triangular block is the single scalar a[bk]
+   (multiplied, so presumably pre-inverted). */
+static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT r[4];
+
+    for (j = 0; j < 4; j++)
+    {
+        r[j] = c[j * ldc];
+    }
+
+    /* Accumulation: c -= a * B. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            r[j] -= a[i] * b[4 * i + j];
+        }
+    }
+
+    b += 4 * bk;
+
+    for (j = 0; j < 4; j++)
+    {
+        r[j] *= a[bk];
+
+        c[j * ldc] = r[j];
+        b[j] = r[j];
+    }
+}
+
+/* Scalar 1x2 tile solve for the strsm LT kernel: one row across two
+   C columns; triangular block is the single scalar a[bk]. */
+static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT r0 = c[0];
+    FLOAT r1 = c[ldc];
+
+    /* Accumulation: c -= a * B. */
+    for (i = 0; i < bk; i++)
+    {
+        r0 -= a[i] * b[2 * i];
+        r1 -= a[i] * b[2 * i + 1];
+    }
+
+    r0 *= a[bk];
+    r1 *= a[bk];
+
+    c[0] = r0;
+    c[ldc] = r1;
+
+    b[2 * bk] = r0;
+    b[2 * bk + 1] = r1;
+}
+
+/* Scalar 1x1 solve: subtract the accumulated dot product, scale by
+   the single triangular entry a[bk] (multiplied, so presumably
+   pre-inverted), and store the result into both b and c. */
+static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT acc = *c;
+
+    for (i = 0; i < bk; i++)
+    {
+        acc -= a[i] * b[i];
+    }
+
+    acc *= a[bk];
+
+    *c = acc;
+    b[bk] = acc;
+}
+
+/* strsm kernel driver (LT variant): sweep C in panels of 8 columns
+   (then 4/2/1 leftovers); within each panel, blocks of 8 rows (then
+   4/2/1).  aa/cc walk the packed A panel and the C tile for the
+   current block; kk starts at 'offset' and grows with the row index
+   -- it is the per-tile depth passed to each solver (presumably the
+   diagonal offset for this LT case; confirm against the Makefile
+   OFFSET convention).  dummy1 (alpha) is never read. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    FLOAT *aa, *cc;
+    BLASLONG i, j, kk;
+
+    /* Full panels of 8 columns. */
+    for (j = (n >> 3); j--;)
+    {
+        kk = offset;
+        aa = a;
+        cc = c;
+
+        for (i = (m >> 3); i--;)
+        {
+            ssolve_8x8_lt_msa(aa, b, cc, ldc, kk);
+
+            aa += 8 * k;
+            cc += 8;
+            kk += 8;
+        }
+
+        /* Row remainders: 4, then 2, then 1. */
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                ssolve_4x8_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 4 * k;
+                cc += 4;
+                kk += 4;
+            }
+
+            if (m & 2)
+            {
+                ssolve_2x8_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 2 * k;
+                cc += 2;
+                kk += 2;
+            }
+
+            if (m & 1)
+            {
+                ssolve_1x8_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += k;
+                cc += 1;
+                kk += 1;
+            }
+        }
+
+        b += 8 * k;
+        c += 8 * ldc;
+    }
+
+    /* Column remainders: 4, then 2, then 1 columns. */
+    if (n & 7)
+    {
+        if (n & 4)
+        {
+            kk = offset;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x4_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x4_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x4_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x4_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += 4 * k;
+            c += 4 * ldc;
+        }
+
+        if (n & 2)
+        {
+            kk = offset;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x2_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+        }
+
+        if (n & 1)
+        {
+            kk = offset;
+            aa = a;
+            cc = c;
+
+            /* Single-column solvers take no ldc. */
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x1_lt_msa(aa, b, cc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x1_lt_msa(aa, b, cc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += k;
+            c += ldc;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c
new file mode 100644
index 000000000..642ee3757
--- /dev/null
+++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c
@@ -0,0 +1,1704 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* MSA 8x8 tile solve for the strsm RN (right, non-transposed) kernel.
+   a  : packed A panel (8 x bk); the solved tile is also stored back
+        here for reuse by later GEMM updates
+   b  : packed B panel (bk x 8) followed by the packed 8x8 upper-
+        triangular block; rows of that block are addressed row-major
+        (row r starts at offset 9*r: 0, 9, 18, 27, 36, 45, 54, 63)
+   c  : 8x8 C tile, column stride ldc; receives the solution
+   bk : number of rank-1 updates to subtract before solving.
+   Diagonal entries are multiplied, not divided by, so they are
+   presumably stored pre-inverted by the packing -- TODO confirm. */
+static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_a0, src_a1;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
+    v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
+    v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
+    v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
+    v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+    FLOAT *c_nxt4line = c + 4 * ldc;
+    FLOAT *c_nxt5line = c + 5 * ldc;
+    FLOAT *c_nxt6line = c + 6 * ldc;
+    FLOAT *c_nxt7line = c + 7 * ldc;
+
+    /* Each C column is held in two vectors (8 rows = 2 x 4 floats). */
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+    LD_SP2(c_nxt4line, 4, src_c8, src_c9);
+    LD_SP2(c_nxt5line, 4, src_c10, src_c11);
+    LD_SP2(c_nxt6line, 4, src_c12, src_c13);
+    LD_SP2(c_nxt7line, 4, src_c14, src_c15);
+
+    /* C -= A * B accumulation over bk. */
+    for (k = 0; k < bk; k++)
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        src_b = LD_SP(b + 4);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c8 -= src_a0 * src_b0;
+        src_c9 -= src_a1 * src_b0;
+        src_c10 -= src_a0 * src_b1;
+        src_c11 -= src_a1 * src_b1;
+        src_c12 -= src_a0 * src_b2;
+        src_c13 -= src_a1 * src_b2;
+        src_c14 -= src_a0 * src_b3;
+        src_c15 -= src_a1 * src_b3;
+
+        a += 8;
+        b += 8;
+    }
+
+    /* Broadcast triangular rows 0 and 1 of the packed B block. */
+    src_b = LD_SP(b + 0);
+    SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+    src_b = LD_SP(b + 4);
+    SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
+
+    src_b = LD_SP(b + 9);
+    SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
+    src_b13 = LD_SP(b + 13);
+    src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
+    src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
+    src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
+
+    /* Column 0 solved; eliminate it from columns 1..7. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 -= src_c0 * src_b1;
+    src_c3 -= src_c1 * src_b1;
+    src_c4 -= src_c0 * src_b2;
+    src_c5 -= src_c1 * src_b2;
+    src_c6 -= src_c0 * src_b3;
+    src_c7 -= src_c1 * src_b3;
+    src_c8 -= src_c0 * src_b4;
+    src_c9 -= src_c1 * src_b4;
+    src_c10 -= src_c0 * src_b5;
+    src_c11 -= src_c1 * src_b5;
+    src_c12 -= src_c0 * src_b6;
+    src_c13 -= src_c1 * src_b6;
+    src_c14 -= src_c0 * src_b7;
+    src_c15 -= src_c1 * src_b7;
+
+    /* Store each solved column into the packed A panel and into C. */
+    ST_SP2(src_c0, src_c1, a, 4);
+    ST_SP2(src_c0, src_c1, c, 4);
+
+    src_c2 *= src_b9;
+    src_c3 *= src_b9;
+    src_c4 -= src_c2 * src_b10;
+    src_c5 -= src_c3 * src_b10;
+    src_c6 -= src_c2 * src_b11;
+    src_c7 -= src_c3 * src_b11;
+    src_c8 -= src_c2 * src_b12;
+    src_c9 -= src_c3 * src_b12;
+    src_c10 -= src_c2 * src_b13;
+    src_c11 -= src_c3 * src_b13;
+    src_c12 -= src_c2 * src_b14;
+    src_c13 -= src_c3 * src_b14;
+    src_c14 -= src_c2 * src_b15;
+    src_c15 -= src_c3 * src_b15;
+
+    ST_SP2(src_c2, src_c3, a + 8, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+
+    /* Triangular rows 2 and 3. */
+    src_b = LD_SP(b + 18);
+    SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
+    src_b22 = LD_SP(b + 22);
+    src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
+    src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
+
+    src_b = LD_SP(b + 27);
+    SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
+    src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
+
+    src_c4 *= src_b18;
+    src_c5 *= src_b18;
+    src_c6 -= src_c4 * src_b19;
+    src_c7 -= src_c5 * src_b19;
+    src_c8 -= src_c4 * src_b20;
+    src_c9 -= src_c5 * src_b20;
+    src_c10 -= src_c4 * src_b21;
+    src_c11 -= src_c5 * src_b21;
+    src_c12 -= src_c4 * src_b22;
+    src_c13 -= src_c5 * src_b22;
+    src_c14 -= src_c4 * src_b23;
+    src_c15 -= src_c5 * src_b23;
+
+    ST_SP2(src_c4, src_c5, a + 16, 4);
+    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
+
+    src_c6 *= src_b27;
+    src_c7 *= src_b27;
+    src_c8 -= src_c6 * src_b28;
+    src_c9 -= src_c7 * src_b28;
+    src_c10 -= src_c6 * src_b29;
+    src_c11 -= src_c7 * src_b29;
+    src_c12 -= src_c6 * src_b30;
+    src_c13 -= src_c7 * src_b30;
+    src_c14 -= src_c6 * src_b31;
+    src_c15 -= src_c7 * src_b31;
+
+    ST_SP2(src_c6, src_c7, a + 24, 4);
+    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
+
+    /* Triangular rows 4..7. */
+    src_b = LD_SP(b + 36);
+    SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
+
+    src_b45 = LD_SP(b + 45);
+    src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
+    src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
+    src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
+
+    src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+    src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+    src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
+
+    src_c8 *= src_b36;
+    src_c9 *= src_b36;
+    src_c10 -= src_c8 * src_b37;
+    src_c11 -= src_c9 * src_b37;
+    src_c12 -= src_c8 * src_b38;
+    src_c13 -= src_c9 * src_b38;
+    src_c14 -= src_c8 * src_b39;
+    src_c15 -= src_c9 * src_b39;
+
+    ST_SP2(src_c8, src_c9, a + 32, 4);
+    ST_SP2(src_c8, src_c9, c_nxt4line, 4);
+
+    src_c10 *= src_b45;
+    src_c11 *= src_b45;
+    src_c12 -= src_c10 * src_b46;
+    src_c13 -= src_c11 * src_b46;
+    src_c14 -= src_c10 * src_b47;
+    src_c15 -= src_c11 * src_b47;
+
+    ST_SP2(src_c10, src_c11, a + 40, 4);
+    ST_SP2(src_c10, src_c11, c_nxt5line, 4);
+
+    src_c12 *= src_b54;
+    src_c13 *= src_b54;
+    src_c14 -= src_c12 * src_b55;
+    src_c15 -= src_c13 * src_b55;
+
+    ST_SP2(src_c12, src_c13, a + 48, 4);
+    ST_SP2(src_c12, src_c13, c_nxt6line, 4);
+
+    src_c14 *= src_b63;
+    src_c15 *= src_b63;
+
+    ST_SP2(src_c14, src_c15, a + 56, 4);
+    ST_SP2(src_c14, src_c15, c_nxt7line, 4);
+}
+
+/* MSA 8x4 tile solve for the strsm RN kernel.  a: packed A panel
+   (8 x bk), also receives the solved tile; b: packed B panel (bk x 4)
+   followed by the packed 4x4 upper-triangular block (row-major; used
+   offsets 0..3, 5..7, 10..11, 15); c: 8x4 C tile, column stride ldc.
+   Diagonal entries are multiplied, not divided by (presumably stored
+   pre-inverted by the packing). */
+static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
+    v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+
+    /* C -= A * B accumulation, unrolled by two over bk. */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        a += 8;
+        b += 4;
+
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        a += 8;
+        b += 4;
+    }
+
+    /* Odd tail iteration; (bk > 0) is redundant once (bk & 1) holds. */
+    if ((bk & 1) && (bk > 0))
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        a += 8;
+        b += 4;
+    }
+
+    /* Broadcast the used triangular entries of the 4x4 block. */
+    src_b = LD_SP(b + 0);
+    SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+    src_b5 = LD_SP(b + 5);
+    src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
+    src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
+    src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
+    src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+    src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+    src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
+
+    /* Forward substitution across the four C columns. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 -= src_c0 * src_b1;
+    src_c3 -= src_c1 * src_b1;
+    src_c4 -= src_c0 * src_b2;
+    src_c5 -= src_c1 * src_b2;
+    src_c6 -= src_c0 * src_b3;
+    src_c7 -= src_c1 * src_b3;
+
+    src_c2 *= src_b5;
+    src_c3 *= src_b5;
+    src_c4 -= src_c2 * src_b6;
+    src_c5 -= src_c3 * src_b6;
+    src_c6 -= src_c2 * src_b7;
+    src_c7 -= src_c3 * src_b7;
+
+    src_c4 *= src_b10;
+    src_c5 *= src_b10;
+    src_c6 -= src_c4 * src_b11;
+    src_c7 -= src_c5 * src_b11;
+
+    src_c6 *= src_b15;
+    src_c7 *= src_b15;
+
+    /* Solved tile back into the packed A panel and into C. */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
+
+    ST_SP2(src_c0, src_c1, c, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
+    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
+}
+
+/* MSA 8x2 tile solve for the strsm RN kernel.  a: packed A panel
+   (8 x bk), also receives the solved tile; b: packed B panel (bk x 2)
+   followed by the packed 2x2 upper-triangular block (offsets 0, 1, 3;
+   diagonal presumably pre-inverted); c: 8x2 C tile, stride ldc. */
+static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_a0, src_a1;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
+    FLOAT *c_nxt1line = c + ldc;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+
+    /* C -= A * B accumulation, unrolled by two over bk. */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+
+        a += 8;
+        b += 2;
+
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+
+        a += 8;
+        b += 2;
+    }
+
+    /* Odd tail iteration; (bk > 0) is redundant once (bk & 1) holds. */
+    if ((bk & 1) && (bk > 0))
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+
+        a += 8;
+        b += 2;
+    }
+
+    /* Solve the 2-column system and write back to a and c. */
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 -= src_c0 * src_b1;
+    src_c3 -= src_c1 * src_b1;
+    src_c2 *= src_b3;
+    src_c3 *= src_b3;
+
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP2(src_c0, src_c1, c, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+}
+
+/* MSA 8x1 tile solve for the strsm RN kernel: one C column of eight
+   rows.  The triangular block degenerates to the single scalar at the
+   final b position (multiplied, presumably pre-inverted).  The solved
+   column is stored back into the packed A panel and into C.  Note:
+   ldc is unused here -- presumably kept for signature uniformity with
+   the other solvers in this file. */
+static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+
+    /* c -= A * b accumulation, unrolled by four over bk. */
+    for (k = 0; k < (bk >> 2); k++)
+    {
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        a += 8;
+        b += 1;
+
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        a += 8;
+        b += 1;
+
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        a += 8;
+        b += 1;
+
+        LD_SP2(a, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        a += 8;
+        b += 1;
+    }
+
+    /* Remainder iterations; (bk > 0) is redundant once (bk & 3) holds. */
+    if ((bk & 3) && (bk > 0))
+    {
+        if (bk & 2)
+        {
+            LD_SP2(a, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+
+            a += 8;
+            b += 1;
+
+            LD_SP2(a, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+
+            a += 8;
+            b += 1;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2(a, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+
+            a += 8;
+            b += 1;
+        }
+    }
+
+    /* Scale by the (presumably pre-inverted) diagonal and store. */
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    ST_SP2(src_c0, src_c1, a, 4);
+    ST_SP2(src_c0, src_c1, c, 4);
+}
+
+/* MSA 4x8 tile solve for the strsm RN kernel.  a: packed A panel
+   (4 x bk), also receives the solved tile; b: packed B panel (bk x 8)
+   followed by the packed 8x8 upper-triangular block (row-major, row r
+   at offset 9*r); c: 4x8 C tile, column stride ldc.  Diagonal entries
+   are multiplied, not divided by (presumably pre-inverted). */
+static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
+    v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
+    v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
+    v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
+    v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+    FLOAT *c_nxt4line = c + 4 * ldc;
+    FLOAT *c_nxt5line = c + 5 * ldc;
+    FLOAT *c_nxt6line = c + 6 * ldc;
+    FLOAT *c_nxt7line = c + 7 * ldc;
+
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+    src_c2 = LD_SP(c_nxt2line);
+    src_c3 = LD_SP(c_nxt3line);
+    src_c4 = LD_SP(c_nxt4line);
+    src_c5 = LD_SP(c_nxt5line);
+    src_c6 = LD_SP(c_nxt6line);
+    src_c7 = LD_SP(c_nxt7line);
+
+    /* C -= A * B accumulation over bk. */
+    for (k = 0; k < bk; k++)
+    {
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        src_b = LD_SP(b + 4);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c4 -= src_a0 * src_b0;
+        src_c5 -= src_a0 * src_b1;
+        src_c6 -= src_a0 * src_b2;
+        src_c7 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 8;
+    }
+
+    /* Broadcast all used entries of the 8x8 triangular block. */
+    src_b = LD_SP(b + 0);
+    SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+    src_b = LD_SP(b + 4);
+    SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
+
+    src_b = LD_SP(b + 9);
+    SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
+    src_b13 = LD_SP(b + 13);
+    src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
+    src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
+    src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
+
+    src_b = LD_SP(b + 18);
+    SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
+    src_b22 = LD_SP(b + 22);
+    src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
+    src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
+
+    src_b = LD_SP(b + 27);
+    SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
+    src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
+
+    src_b = LD_SP(b + 36);
+    SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
+
+    src_b45 = LD_SP(b + 45);
+    src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
+    src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
+    src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
+
+    src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+    src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+    src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
+
+    /* Forward substitution: finish column i, then eliminate it from
+       columns i+1..7. */
+    src_c0 *= src_b0;
+    src_c1 -= src_c0 * src_b1;
+    src_c2 -= src_c0 * src_b2;
+    src_c3 -= src_c0 * src_b3;
+    src_c4 -= src_c0 * src_b4;
+    src_c5 -= src_c0 * src_b5;
+    src_c6 -= src_c0 * src_b6;
+    src_c7 -= src_c0 * src_b7;
+
+    src_c1 *= src_b9;
+    src_c2 -= src_c1 * src_b10;
+    src_c3 -= src_c1 * src_b11;
+    src_c4 -= src_c1 * src_b12;
+    src_c5 -= src_c1 * src_b13;
+    src_c6 -= src_c1 * src_b14;
+    src_c7 -= src_c1 * src_b15;
+
+    src_c2 *= src_b18;
+    src_c3 -= src_c2 * src_b19;
+    src_c4 -= src_c2 * src_b20;
+    src_c5 -= src_c2 * src_b21;
+    src_c6 -= src_c2 * src_b22;
+    src_c7 -= src_c2 * src_b23;
+
+    src_c3 *= src_b27;
+    src_c4 -= src_c3 * src_b28;
+    src_c5 -= src_c3 * src_b29;
+    src_c6 -= src_c3 * src_b30;
+    src_c7 -= src_c3 * src_b31;
+
+    src_c4 *= src_b36;
+    src_c5 -= src_c4 * src_b37;
+    src_c6 -= src_c4 * src_b38;
+    src_c7 -= src_c4 * src_b39;
+
+    src_c5 *= src_b45;
+    src_c6 -= src_c5 * src_b46;
+    src_c7 -= src_c5 * src_b47;
+
+    src_c6 *= src_b54;
+    src_c7 -= src_c6 * src_b55;
+
+    src_c7 *= src_b63;
+
+    /* Solved tile back into the packed A panel and into C. */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+    ST_SP(src_c2, c_nxt2line);
+    ST_SP(src_c3, c_nxt3line);
+    ST_SP(src_c4, c_nxt4line);
+    ST_SP(src_c5, c_nxt5line);
+    ST_SP(src_c6, c_nxt6line);
+    ST_SP(src_c7, c_nxt7line);
+}
+
+/* MSA 4x4 tile solve for the strsm RN kernel.  a: packed A panel
+   (4 x bk), also receives the solved tile; b: packed B panel (bk x 4)
+   followed by the packed 4x4 upper-triangular block (row-major; used
+   offsets 0..3, 5..7, 10..11, 15; diagonal presumably pre-inverted);
+   c: 4x4 C tile, column stride ldc. */
+static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3;
+    v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+    src_c2 = LD_SP(c_nxt2line);
+    src_c3 = LD_SP(c_nxt3line);
+
+    /* C -= A * B accumulation, unrolled by two over bk. */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+    }
+
+    /* Odd tail iteration; (bk > 0) is redundant once (bk & 1) holds. */
+    if ((bk & 1) && (bk > 0))
+    {
+        src_a0 = LD_SP(a);
+
+        src_b = LD_SP(b + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        a += 4;
+        b += 4;
+    }
+
+    /* Broadcast the used triangular entries of the 4x4 block. */
+    src_b = LD_SP(b + 0);
+    SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+    src_b5 = LD_SP(b + 5);
+    src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
+    src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
+    src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
+    src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+    src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+    src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
+
+    /* Forward substitution across the four C columns. */
+    src_c0 *= src_b0;
+    src_c1 -= src_c0 * src_b1;
+    src_c2 -= src_c0 * src_b2;
+    src_c3 -= src_c0 * src_b3;
+
+    src_c1 *= src_b5;
+    src_c2 -= src_c1 * src_b6;
+    src_c3 -= src_c1 * src_b7;
+
+    src_c2 *= src_b10;
+    src_c3 -= src_c2 * src_b11;
+
+    src_c3 *= src_b15;
+
+    /* Solved tile back into the packed A panel and into C. */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+    ST_SP(src_c2, c_nxt2line);
+    ST_SP(src_c3, c_nxt3line);
+}
+
+/* Solve a 4x2 block, triangular matrix on the right: bk rank-1 updates
+   followed by forward substitution over the two columns.  b offsets 0, 1, 3
+   index the row-major upper triangle of a 2x2 block; the diagonal entries
+   are multiplied in (presumably pre-inverted -- TODO confirm).
+   NOTE(review): LD_SP(b) loads 4 floats although only lanes 0 and 1 are
+   used; this relies on b being readable past the 2-element row. */
+static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3;
+    FLOAT *c_nxt1line = c + ldc;
+
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+
+    /* GEMM update c -= a*b, manually unrolled by 4 over bk */
+    for (k = 0; k < (bk >> 2); k++)
+    {
+        src_a = LD_SP(a);
+        src_b0 = LD_SP(b);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        a += 4;
+        b += 2;
+
+        src_a = LD_SP(a);
+        src_b0 = LD_SP(b);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        a += 4;
+        b += 2;
+
+        src_a = LD_SP(a);
+        src_b0 = LD_SP(b);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        a += 4;
+        b += 2;
+
+        src_a = LD_SP(a);
+        src_b0 = LD_SP(b);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        a += 4;
+        b += 2;
+    }
+
+    /* leftover 1..3 iterations */
+    if ((bk & 3) && (bk > 0))
+    {
+        if (bk & 2)
+        {
+            src_a = LD_SP(a);
+            src_b0 = LD_SP(b);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+
+            a += 4;
+            b += 2;
+
+            src_a = LD_SP(a);
+            src_b0 = LD_SP(b);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+
+            a += 4;
+            b += 2;
+        }
+
+        if (bk & 1)
+        {
+            src_a = LD_SP(a);
+            src_b0 = LD_SP(b);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+
+            a += 4;
+            b += 2;
+        }
+    }
+
+    /* 2x2 upper-triangular solve: b[0], b[1], b[3] (b[2] is the zero lower part) */
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+
+    src_c0 *= src_b0;
+    src_c1 -= src_c0 * src_b1;
+    src_c1 *= src_b3;
+
+    /* store back to the packed A buffer and to C */
+    ST_SP2(src_c0, src_c1, a, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+}
+
+/* Scalar 4x1 solve: subtract bk rank-1 updates, then scale the four
+   values by b[0] (the single triangular entry, presumably pre-inverted
+   since it is multiplied rather than divided -- TODO confirm).
+   ldc is unused here (only one column of C is touched). */
+static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, c0, c1, c2, c3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c2 = *(c + 2);
+    c3 = *(c + 3);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[1] * b[0];
+        c2 -= a[2] * b[0];
+        c3 -= a[3] * b[0];
+
+        a += 4;
+        b += 1;
+    }
+
+    b0 = *(b + 0);
+
+    c0 *= b0;
+    c1 *= b0;
+    c2 *= b0;
+    c3 *= b0;
+
+    /* store solved values to both the packed A buffer and C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c2;
+    *(a + 3) = c3;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + 2) = c2;
+    *(c + 3) = c3;
+}
+
+/* Scalar 2x8 solve: two rows of C against an 8-column triangular block.
+   After bk rank-1 updates, forward-substitutes through the 8x8 row-major
+   upper triangle of b; the loaded offsets (0..7, 9..15, 18..23, 27..31,
+   36..39, 45..47, 54, 55, 63) are exactly that triangle, lower part
+   skipped.  Diagonal entries (b0, b9, b18, b27, b36, b45, b54, b63) are
+   multiplied in, so they are presumably pre-inverted -- TODO confirm. */
+static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
+    FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31;
+    FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63;
+    FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+    FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
+    FLOAT c0_nxt7, c1_nxt7;
+
+    /* c{0,1}_nxtN = the two rows of column N of the C block */
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c0_nxt1 = *(c + 0 + 1 * ldc);
+    c1_nxt1 = *(c + 1 + 1 * ldc);
+    c0_nxt2 = *(c + 0 + 2 * ldc);
+    c1_nxt2 = *(c + 1 + 2 * ldc);
+    c0_nxt3 = *(c + 0 + 3 * ldc);
+    c1_nxt3 = *(c + 1 + 3 * ldc);
+    c0_nxt4 = *(c + 0 + 4 * ldc);
+    c1_nxt4 = *(c + 1 + 4 * ldc);
+    c0_nxt5 = *(c + 0 + 5 * ldc);
+    c1_nxt5 = *(c + 1 + 5 * ldc);
+    c0_nxt6 = *(c + 0 + 6 * ldc);
+    c1_nxt6 = *(c + 1 + 6 * ldc);
+    c0_nxt7 = *(c + 0 + 7 * ldc);
+    c1_nxt7 = *(c + 1 + 7 * ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[1] * b[0];
+        c0_nxt1 -= a[0] * b[1];
+        c1_nxt1 -= a[1] * b[1];
+        c0_nxt2 -= a[0] * b[2];
+        c1_nxt2 -= a[1] * b[2];
+        c0_nxt3 -= a[0] * b[3];
+        c1_nxt3 -= a[1] * b[3];
+        c0_nxt4 -= a[0] * b[4];
+        c1_nxt4 -= a[1] * b[4];
+        c0_nxt5 -= a[0] * b[5];
+        c1_nxt5 -= a[1] * b[5];
+        c0_nxt6 -= a[0] * b[6];
+        c1_nxt6 -= a[1] * b[6];
+        c0_nxt7 -= a[0] * b[7];
+        c1_nxt7 -= a[1] * b[7];
+
+        a += 2;
+        b += 8;
+    }
+
+    /* load the 8x8 row-major upper triangle of b */
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b2 = *(b + 2);
+    b3 = *(b + 3);
+    b4 = *(b + 4);
+    b5 = *(b + 5);
+    b6 = *(b + 6);
+    b7 = *(b + 7);
+    b9 = *(b + 9);
+    b10 = *(b + 10);
+    b11 = *(b + 11);
+    b12 = *(b + 12);
+    b13 = *(b + 13);
+    b14 = *(b + 14);
+    b15 = *(b + 15);
+    b18 = *(b + 18);
+    b19 = *(b + 19);
+    b20 = *(b + 20);
+    b21 = *(b + 21);
+    b22 = *(b + 22);
+    b23 = *(b + 23);
+    b27 = *(b + 27);
+    b28 = *(b + 28);
+    b29 = *(b + 29);
+    b30 = *(b + 30);
+    b31 = *(b + 31);
+    b36 = *(b + 36);
+    b37 = *(b + 37);
+    b38 = *(b + 38);
+    b39 = *(b + 39);
+    b45 = *(b + 45);
+    b46 = *(b + 46);
+    b47 = *(b + 47);
+    b54 = *(b + 54);
+    b55 = *(b + 55);
+    b63 = *(b + 63);
+
+    /* forward substitution: scale column j by its (pre-inverted) diagonal,
+       then eliminate it from all later columns */
+    c0 *= b0;
+    c1 *= b0;
+
+    c0_nxt1 -= c0 * b1;
+    c1_nxt1 -= c1 * b1;
+
+    c0_nxt2 -= c0 * b2;
+    c1_nxt2 -= c1 * b2;
+
+    c0_nxt3 -= c0 * b3;
+    c1_nxt3 -= c1 * b3;
+
+    c0_nxt4 -= c0 * b4;
+    c1_nxt4 -= c1 * b4;
+
+    c0_nxt5 -= c0 * b5;
+    c1_nxt5 -= c1 * b5;
+
+    c0_nxt6 -= c0 * b6;
+    c1_nxt6 -= c1 * b6;
+
+    c0_nxt7 -= c0 * b7;
+    c1_nxt7 -= c1 * b7;
+
+    c0_nxt1 *= b9;
+    c1_nxt1 *= b9;
+
+    c0_nxt2 -= c0_nxt1 * b10;
+    c1_nxt2 -= c1_nxt1 * b10;
+
+    c0_nxt3 -= c0_nxt1 * b11;
+    c1_nxt3 -= c1_nxt1 * b11;
+
+    c0_nxt4 -= c0_nxt1 * b12;
+    c1_nxt4 -= c1_nxt1 * b12;
+
+    c0_nxt5 -= c0_nxt1 * b13;
+    c1_nxt5 -= c1_nxt1 * b13;
+
+    c0_nxt6 -= c0_nxt1 * b14;
+    c1_nxt6 -= c1_nxt1 * b14;
+
+    c0_nxt7 -= c0_nxt1 * b15;
+    c1_nxt7 -= c1_nxt1 * b15;
+
+    c0_nxt2 *= b18;
+    c1_nxt2 *= b18;
+
+    c0_nxt3 -= c0_nxt2 * b19;
+    c1_nxt3 -= c1_nxt2 * b19;
+
+    c0_nxt4 -= c0_nxt2 * b20;
+    c1_nxt4 -= c1_nxt2 * b20;
+
+    c0_nxt5 -= c0_nxt2 * b21;
+    c1_nxt5 -= c1_nxt2 * b21;
+
+    c0_nxt6 -= c0_nxt2 * b22;
+    c1_nxt6 -= c1_nxt2 * b22;
+
+    c0_nxt7 -= c0_nxt2 * b23;
+    c1_nxt7 -= c1_nxt2 * b23;
+
+    c0_nxt3 *= b27;
+    c1_nxt3 *= b27;
+
+    c0_nxt4 -= c0_nxt3 * b28;
+    c1_nxt4 -= c1_nxt3 * b28;
+
+    c0_nxt5 -= c0_nxt3 * b29;
+    c1_nxt5 -= c1_nxt3 * b29;
+
+    c0_nxt6 -= c0_nxt3 * b30;
+    c1_nxt6 -= c1_nxt3 * b30;
+
+    c0_nxt7 -= c0_nxt3 * b31;
+    c1_nxt7 -= c1_nxt3 * b31;
+
+    c0_nxt4 *= b36;
+    c1_nxt4 *= b36;
+
+    c0_nxt5 -= c0_nxt4 * b37;
+    c1_nxt5 -= c1_nxt4 * b37;
+
+    c0_nxt6 -= c0_nxt4 * b38;
+    c1_nxt6 -= c1_nxt4 * b38;
+
+    c0_nxt7 -= c0_nxt4 * b39;
+    c1_nxt7 -= c1_nxt4 * b39;
+
+    c0_nxt5 *= b45;
+    c1_nxt5 *= b45;
+
+    c0_nxt6 -= c0_nxt5 * b46;
+    c1_nxt6 -= c1_nxt5 * b46;
+
+    c0_nxt7 -= c0_nxt5 * b47;
+    c1_nxt7 -= c1_nxt5 * b47;
+
+    c0_nxt6 *= b54;
+    c1_nxt6 *= b54;
+
+    c0_nxt7 -= c0_nxt6 * b55;
+    c1_nxt7 -= c1_nxt6 * b55;
+
+    c0_nxt7 *= b63;
+    c1_nxt7 *= b63;
+
+    /* store the 2x8 result into the packed A buffer (column-major pairs) */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c0_nxt1;
+    *(a + 3) = c1_nxt1;
+    *(a + 4) = c0_nxt2;
+    *(a + 5) = c1_nxt2;
+    *(a + 6) = c0_nxt3;
+    *(a + 7) = c1_nxt3;
+    *(a + 8) = c0_nxt4;
+    *(a + 9) = c1_nxt4;
+    *(a + 10) = c0_nxt5;
+    *(a + 11) = c1_nxt5;
+    *(a + 12) = c0_nxt6;
+    *(a + 13) = c1_nxt6;
+    *(a + 14) = c0_nxt7;
+    *(a + 15) = c1_nxt7;
+
+    /* and into C */
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + 0 + 1 * ldc) = c0_nxt1;
+    *(c + 1 + 1 * ldc) = c1_nxt1;
+    *(c + 0 + 2 * ldc) = c0_nxt2;
+    *(c + 1 + 2 * ldc) = c1_nxt2;
+    *(c + 0 + 3 * ldc) = c0_nxt3;
+    *(c + 1 + 3 * ldc) = c1_nxt3;
+    *(c + 0 + 4 * ldc) = c0_nxt4;
+    *(c + 1 + 4 * ldc) = c1_nxt4;
+    *(c + 0 + 5 * ldc) = c0_nxt5;
+    *(c + 1 + 5 * ldc) = c1_nxt5;
+    *(c + 0 + 6 * ldc) = c0_nxt6;
+    *(c + 1 + 6 * ldc) = c1_nxt6;
+    *(c + 0 + 7 * ldc) = c0_nxt7;
+    *(c + 1 + 7 * ldc) = c1_nxt7;
+}
+
+/* Scalar 2x4 solve: bk rank-1 updates, then forward substitution through
+   the 4x4 row-major upper triangle of b (offsets 0..3, 5..7, 10, 11, 15).
+   Diagonal entries (b0, b5, b10, b15) are multiplied in, presumably
+   pre-inverted -- TODO confirm.  Results go to both a and c. */
+static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1;
+    FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c0_nxt1 = *(c + 0 + 1 * ldc);
+    c1_nxt1 = *(c + 1 + 1 * ldc);
+    c0_nxt2 = *(c + 0 + 2 * ldc);
+    c1_nxt2 = *(c + 1 + 2 * ldc);
+    c0_nxt3 = *(c + 0 + 3 * ldc);
+    c1_nxt3 = *(c + 1 + 3 * ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[1] * b[0];
+        c0_nxt1 -= a[0] * b[1];
+        c1_nxt1 -= a[1] * b[1];
+        c0_nxt2 -= a[0] * b[2];
+        c1_nxt2 -= a[1] * b[2];
+        c0_nxt3 -= a[0] * b[3];
+        c1_nxt3 -= a[1] * b[3];
+
+        a += 2;
+        b += 4;
+    }
+
+    /* upper triangle of the 4x4 block of b */
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b2 = *(b + 2);
+    b3 = *(b + 3);
+    b5 = *(b + 5);
+    b6 = *(b + 6);
+    b7 = *(b + 7);
+    b10 = *(b + 10);
+    b11 = *(b + 11);
+    b15 = *(b + 15);
+
+    /* forward substitution, column by column */
+    c0 *= b0;
+    c1 *= b0;
+
+    c0_nxt1 -= c0 * b1;
+    c1_nxt1 -= c1 * b1;
+    c0_nxt1 *= b5;
+    c1_nxt1 *= b5;
+
+    c0_nxt2 -= c0 * b2;
+    c1_nxt2 -= c1 * b2;
+    c0_nxt2 -= c0_nxt1 * b6;
+    c1_nxt2 -= c1_nxt1 * b6;
+    c0_nxt2 *= b10;
+    c1_nxt2 *= b10;
+
+    c0_nxt3 -= c0 * b3;
+    c1_nxt3 -= c1 * b3;
+    c0_nxt3 -= c0_nxt1 * b7;
+    c1_nxt3 -= c1_nxt1 * b7;
+    c0_nxt3 -= c0_nxt2 * b11;
+    c1_nxt3 -= c1_nxt2 * b11;
+    c0_nxt3 *= b15;
+    c1_nxt3 *= b15;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c0_nxt1;
+    *(a + 3) = c1_nxt1;
+    *(a + 4) = c0_nxt2;
+    *(a + 5) = c1_nxt2;
+    *(a + 6) = c0_nxt3;
+    *(a + 7) = c1_nxt3;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + 1 * ldc) = c0_nxt1;
+    *(c + 1 + 1 * ldc) = c1_nxt1;
+    *(c + 2 * ldc) = c0_nxt2;
+    *(c + 1 + 2 * ldc) = c1_nxt2;
+    *(c + 3 * ldc) = c0_nxt3;
+    *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+/* Scalar 2x2 solve: bk rank-1 updates, then the 2x2 upper-triangular
+   substitution using b[0], b[1], b[3] (b[2] is the skipped lower entry).
+   Diagonal entries are multiplied in, presumably pre-inverted. */
+static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c0_nxt = *(c + 0 + ldc);
+    c1_nxt = *(c + 1 + ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[1] * b[0];
+        c0_nxt -= a[0] * b[1];
+        c1_nxt -= a[1] * b[1];
+
+        a += 2;
+        b += 2;
+    }
+
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b3 = *(b + 3);
+
+    c0 *= b0;
+    c1 *= b0;
+
+    c0_nxt -= c0 * b1;
+    c1_nxt -= c1 * b1;
+
+    c0_nxt *= b3;
+    c1_nxt *= b3;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c0_nxt;
+    *(a + 3) = c1_nxt;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + ldc) = c0_nxt;
+    *(c + 1 + ldc) = c1_nxt;
+}
+
+/* Scalar 2x1 solve: bk rank-1 updates, then scale both values by b[0]
+   (single triangular entry, presumably pre-inverted).  ldc is unused. */
+static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, c0, c1;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[1] * b[0];
+
+        a += 2;
+        b += 1;
+    }
+
+    b0 = *(b + 0);
+
+    c0 *= b0;
+    c1 *= b0;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+}
+
+/* Scalar 1x8 solve: one row of C against an 8-column triangular block.
+   Same triangle layout as ssolve_2x8_rn_msa: offsets 0..7, 9..15, 18..23,
+   27..31, 36..39, 45..47, 54, 55, 63 form the row-major upper triangle of
+   an 8x8 block; diagonals are multiplied in (presumably pre-inverted). */
+static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
+    FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38;
+    FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7;
+
+    /* one element from each of the 8 columns of C */
+    c0 = *(c + 0);
+    c1 = *(c + 1 * ldc);
+    c2 = *(c + 2 * ldc);
+    c3 = *(c + 3 * ldc);
+    c4 = *(c + 4 * ldc);
+    c5 = *(c + 5 * ldc);
+    c6 = *(c + 6 * ldc);
+    c7 = *(c + 7 * ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[0] * b[1];
+        c2 -= a[0] * b[2];
+        c3 -= a[0] * b[3];
+        c4 -= a[0] * b[4];
+        c5 -= a[0] * b[5];
+        c6 -= a[0] * b[6];
+        c7 -= a[0] * b[7];
+
+        a += 1;
+        b += 8;
+    }
+
+    /* upper triangle of the 8x8 block of b */
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b2 = *(b + 2);
+    b3 = *(b + 3);
+    b4 = *(b + 4);
+    b5 = *(b + 5);
+    b6 = *(b + 6);
+    b7 = *(b + 7);
+    b9 = *(b + 9);
+    b10 = *(b + 10);
+    b11 = *(b + 11);
+    b12 = *(b + 12);
+    b13 = *(b + 13);
+    b14 = *(b + 14);
+    b15 = *(b + 15);
+    b18 = *(b + 18);
+    b19 = *(b + 19);
+    b20 = *(b + 20);
+    b21 = *(b + 21);
+    b22 = *(b + 22);
+    b23 = *(b + 23);
+    b27 = *(b + 27);
+    b28 = *(b + 28);
+    b29 = *(b + 29);
+    b30 = *(b + 30);
+    b31 = *(b + 31);
+    b36 = *(b + 36);
+    b37 = *(b + 37);
+    b38 = *(b + 38);
+    b39 = *(b + 39);
+    b45 = *(b + 45);
+    b46 = *(b + 46);
+    b47 = *(b + 47);
+    b54 = *(b + 54);
+    b55 = *(b + 55);
+    b63 = *(b + 63);
+
+    /* forward substitution, column by column */
+    c0 *= b0;
+
+    c1 -= c0 * b1;
+    c1 *= b9;
+
+    c2 -= c0 * b2;
+    c2 -= c1 * b10;
+    c2 *= b18;
+
+    c3 -= c0 * b3;
+    c3 -= c1 * b11;
+    c3 -= c2 * b19;
+    c3 *= b27;
+
+    c4 -= c0 * b4;
+    c4 -= c1 * b12;
+    c4 -= c2 * b20;
+    c4 -= c3 * b28;
+    c4 *= b36;
+
+    c5 -= c0 * b5;
+    c5 -= c1 * b13;
+    c5 -= c2 * b21;
+    c5 -= c3 * b29;
+    c5 -= c4 * b37;
+    c5 *= b45;
+
+    c6 -= c0 * b6;
+    c6 -= c1 * b14;
+    c6 -= c2 * b22;
+    c6 -= c3 * b30;
+    c6 -= c4 * b38;
+    c6 -= c5 * b46;
+    c6 *= b54;
+
+    c7 -= c0 * b7;
+    c7 -= c1 * b15;
+    c7 -= c2 * b23;
+    c7 -= c3 * b31;
+    c7 -= c4 * b39;
+    c7 -= c5 * b47;
+    c7 -= c6 * b55;
+    c7 *= b63;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c2;
+    *(a + 3) = c3;
+    *(a + 4) = c4;
+    *(a + 5) = c5;
+    *(a + 6) = c6;
+    *(a + 7) = c7;
+
+    *(c + 0) = c0;
+    *(c + 1 * ldc) = c1;
+    *(c + 2 * ldc) = c2;
+    *(c + 3 * ldc) = c3;
+    *(c + 4 * ldc) = c4;
+    *(c + 5 * ldc) = c5;
+    *(c + 6 * ldc) = c6;
+    *(c + 7 * ldc) = c7;
+}
+
+/* Scalar 1x4 solve: bk rank-1 updates, then forward substitution through
+   the 4x4 row-major upper triangle of b (offsets 0..3, 5..7, 10, 11, 15);
+   diagonals multiplied in, presumably pre-inverted. */
+static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1 * ldc);
+    c2 = *(c + 2 * ldc);
+    c3 = *(c + 3 * ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[0] * b[1];
+        c2 -= a[0] * b[2];
+        c3 -= a[0] * b[3];
+
+        a += 1;
+        b += 4;
+    }
+
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b2 = *(b + 2);
+    b3 = *(b + 3);
+    b5 = *(b + 5);
+    b6 = *(b + 6);
+    b7 = *(b + 7);
+    b10 = *(b + 10);
+    b11 = *(b + 11);
+    b15 = *(b + 15);
+
+    /* forward substitution, column by column */
+    c0 *= b0;
+
+    c1 -= c0 * b1;
+    c1 *= b5;
+
+    c2 -= c0 * b2;
+    c2 -= c1 * b6;
+    c2 *= b10;
+
+    c3 -= c0 * b3;
+    c3 -= c1 * b7;
+    c3 -= c2 * b11;
+    c3 *= b15;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c2;
+    *(a + 3) = c3;
+
+    *(c + 0) = c0;
+    *(c + 1 * ldc) = c1;
+    *(c + 2 * ldc) = c2;
+    *(c + 3 * ldc) = c3;
+}
+
+/* Scalar 1x2 solve: bk rank-1 updates, then the 2x2 upper-triangular
+   substitution via b[0], b[1], b[3] (diagonals presumably pre-inverted). */
+static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT b0, b1, b3, c0, c1;
+
+    c0 = *c;
+    c1 = *(c + ldc);
+
+    /* GEMM update: c -= a * b over bk steps */
+    for (k = 0; k < bk; k++)
+    {
+        c0 -= a[0] * b[0];
+        c1 -= a[0] * b[1];
+
+        a += 1;
+        b += 2;
+    }
+
+    b0 = *(b + 0);
+    b1 = *(b + 1);
+    b3 = *(b + 3);
+
+    c0 *= b0;
+
+    c1 -= c0 * b1;
+    c1 *= b3;
+
+    /* store to packed A buffer and to C */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+
+    *(c + 0) = c0;
+    *(c + ldc) = c1;
+}
+
+/* Scalar 1x1 solve: subtract bk products, then scale by the single
+   triangular entry *b (presumably pre-inverted since it is multiplied).
+   Note this signature has no ldc parameter, unlike the other kernels. */
+static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG k;
+
+    for (k = 0; k < bk; k++)
+    {
+        *c -= a[0] * b[0];
+
+        a++;
+        b++;
+    }
+
+    *c *= *b;
+    *a = *c;   /* result also copied back into the packed A buffer */
+}
+
+/* TRSM driver (RN variant): tiles the m x n problem over the solve kernels.
+   Columns of B/C are consumed in chunks of 8, then 4, 2, 1; within each
+   column chunk, rows are consumed in chunks of 8, 4, 2, 1.  kk starts at
+   -offset and grows by the column-chunk width after each chunk, giving each
+   kernel the number of already-solved updates to subtract (bk argument).
+   dummy1 (the alpha slot in the common kernel signature) is unused.
+   Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    FLOAT *aa, *cc;
+    BLASLONG i, j, kk;
+
+    kk = -offset;
+
+    /* full 8-column panels */
+    for (j = (n >> 3); j--;)
+    {
+        aa = a;
+        cc = c;
+
+        for (i = (m >> 3); i--;)
+        {
+            ssolve_8x8_rn_msa(aa, b, cc, ldc, kk);
+
+            aa += 8 * k;
+            cc += 8;
+        }
+
+        /* row remainders: 4, 2, 1 */
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                ssolve_4x8_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 4 * k;
+                cc += 4;
+            }
+
+            if (m & 2)
+            {
+                ssolve_2x8_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 2 * k;
+                cc += 2;
+            }
+
+            if (m & 1)
+            {
+                ssolve_1x8_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += k;
+                cc += 1;
+            }
+        }
+
+        kk += 8;
+        b += 8 * k;
+        c += 8 * ldc;
+    }
+
+    /* column remainders: 4, 2, 1 */
+    if (n & 7)
+    {
+        if (n & 4)
+        {
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x4_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x4_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x4_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x4_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            b += 4 * k;
+            c += 4 * ldc;
+            kk += 4;
+        }
+
+        if (n & 2)
+        {
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x2_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+            kk += 2;
+        }
+
+        if (n & 1)
+        {
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x1_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x1_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x1_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    /* 1x1 kernel takes no ldc */
+                    ssolve_1x1_rn_msa(aa, b, cc, kk);
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            b += k;
+            c += ldc;
+            kk += 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c
new file mode 100644
index 000000000..21e41c8fb
--- /dev/null
+++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c
@@ -0,0 +1,1726 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* RT-variant 8x8 solve: after the GEMM loop (which advances local copies
+   aa/bb only), b is rewound by 64 and the columns are solved last-to-first
+   (backward substitution) through the 8x8 triangular block of b.
+   NOTE(review): results are stored at a-16, a-32, a-48, a-64 and b is read
+   below its incoming value, so the caller evidently passes a and b pointing
+   one block past the data -- confirm against the RT driver.
+   Diagonal entries (b63, b54, b45, b36, b27, b18, b9, b0) are multiplied
+   in, presumably pre-inverted -- TODO confirm. */
+static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
+    v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
+    v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
+    v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
+    v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+    FLOAT *c_nxt4line = c + 4 * ldc;
+    FLOAT *c_nxt5line = c + 5 * ldc;
+    FLOAT *c_nxt6line = c + 6 * ldc;
+    FLOAT *c_nxt7line = c + 7 * ldc;
+
+    /* each column of the 8x8 C block held in two vectors (8 floats) */
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+    LD_SP2(c_nxt4line, 4, src_c8, src_c9);
+    LD_SP2(c_nxt5line, 4, src_c10, src_c11);
+    LD_SP2(c_nxt6line, 4, src_c12, src_c13);
+    LD_SP2(c_nxt7line, 4, src_c14, src_c15);
+
+    /* GEMM update: c -= a * b over bk steps (aa/bb advance, a/b keep base) */
+    for (k = 0; k < bk; k++)
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b = LD_SP(bb + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        src_b = LD_SP(bb + 4);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c8 -= src_a0 * src_b0;
+        src_c9 -= src_a1 * src_b0;
+        src_c10 -= src_a0 * src_b1;
+        src_c11 -= src_a1 * src_b1;
+        src_c12 -= src_a0 * src_b2;
+        src_c13 -= src_a1 * src_b2;
+        src_c14 -= src_a0 * src_b3;
+        src_c15 -= src_a1 * src_b3;
+
+        aa += 8;
+        bb += 8;
+    }
+
+    /* step back to the 8x8 triangular block preceding the panel */
+    b -= 64;
+
+    src_b = LD_SP(b + 60);
+    SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
+    src_b = LD_SP(b + 56);
+    SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
+
+    /* backward substitution: column 7 first, eliminated from columns 0..6 */
+    src_c15 *= src_b63;
+    src_c14 *= src_b63;
+    src_c13 -= src_c15 * src_b62;
+    src_c12 -= src_c14 * src_b62;
+    src_c11 -= src_c15 * src_b61;
+    src_c10 -= src_c14 * src_b61;
+    src_c9 -= src_c15 * src_b60;
+    src_c8 -= src_c14 * src_b60;
+    src_c7 -= src_c15 * src_b59;
+    src_c6 -= src_c14 * src_b59;
+    src_c5 -= src_c15 * src_b58;
+    src_c4 -= src_c14 * src_b58;
+    src_c3 -= src_c15 * src_b57;
+    src_c2 -= src_c14 * src_b57;
+    src_c1 -= src_c15 * src_b56;
+    src_c0 -= src_c14 * src_b56;
+
+    src_b = LD_SP(b + 48);
+    SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
+    src_b52 = LD_SP(b + 52);
+    src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
+    src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
+    src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
+
+    src_c12 *= src_b54;
+    src_c13 *= src_b54;
+    src_c10 -= src_c12 * src_b53;
+    src_c11 -= src_c13 * src_b53;
+    src_c8 -= src_c12 * src_b52;
+    src_c9 -= src_c13 * src_b52;
+    src_c6 -= src_c12 * src_b51;
+    src_c7 -= src_c13 * src_b51;
+    src_c4 -= src_c12 * src_b50;
+    src_c5 -= src_c13 * src_b50;
+    src_c2 -= src_c12 * src_b49;
+    src_c3 -= src_c13 * src_b49;
+    src_c0 -= src_c12 * src_b48;
+    src_c1 -= src_c13 * src_b48;
+
+    /* columns 6 and 7 are final: store to packed A (behind a) and to C */
+    ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4);
+    ST_SP2(src_c12, src_c13, c_nxt6line, 4);
+    ST_SP2(src_c14, src_c15, c_nxt7line, 4);
+
+    src_b = LD_SP(b + 40);
+    SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
+    src_b44 = LD_SP(b + 44);
+    src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
+    src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
+
+    src_c10 *= src_b45;
+    src_c11 *= src_b45;
+    src_c8 -= src_c10 * src_b44;
+    src_c9 -= src_c11 * src_b44;
+    src_c6 -= src_c10 * src_b43;
+    src_c7 -= src_c11 * src_b43;
+    src_c4 -= src_c10 * src_b42;
+    src_c5 -= src_c11 * src_b42;
+    src_c2 -= src_c10 * src_b41;
+    src_c3 -= src_c11 * src_b41;
+    src_c0 -= src_c10 * src_b40;
+    src_c1 -= src_c11 * src_b40;
+
+    src_b = LD_SP(b + 32);
+    SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
+    src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
+
+    src_c8 *= src_b36;
+    src_c9 *= src_b36;
+    src_c6 -= src_c8 * src_b35;
+    src_c7 -= src_c9 * src_b35;
+    src_c4 -= src_c8 * src_b34;
+    src_c5 -= src_c9 * src_b34;
+    src_c2 -= src_c8 * src_b33;
+    src_c3 -= src_c9 * src_b33;
+    src_c0 -= src_c8 * src_b32;
+    src_c1 -= src_c9 * src_b32;
+
+    ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4);
+    ST_SP2(src_c8, src_c9, c_nxt4line, 4);
+    ST_SP2(src_c10, src_c11, c_nxt5line, 4);
+
+    src_b = LD_SP(b + 24);
+    SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
+
+    src_c6 *= src_b27;
+    src_c7 *= src_b27;
+    src_c4 -= src_c6 * src_b26;
+    src_c5 -= src_c7 * src_b26;
+    src_c2 -= src_c6 * src_b25;
+    src_c3 -= src_c7 * src_b25;
+    src_c0 -= src_c6 * src_b24;
+    src_c1 -= src_c7 * src_b24;
+
+    src_b16 = LD_SP(b + 16);
+    src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
+    src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
+    src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
+
+    src_c4 *= src_b18;
+    src_c5 *= src_b18;
+    src_c2 -= src_c4 * src_b17;
+    src_c3 -= src_c5 * src_b17;
+    src_c0 -= src_c4 * src_b16;
+    src_c1 -= src_c5 * src_b16;
+
+    ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4);
+    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
+    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
+
+    src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+    src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    src_c2 *= src_b9;
+    src_c3 *= src_b9;
+    src_c0 -= src_c2 * src_b8;
+    src_c1 -= src_c3 * src_b8;
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4);
+
+    ST_SP2(src_c0, src_c1, c, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+}
+
+/* RT-variant 8x4 solve: GEMM loop (unrolled x2 over bk, using aa/bb),
+   then a/b are stepped back past the 8x4 panel (a -= 32, b -= 16) and the
+   four columns are solved last-to-first via the 4x4 triangular block of b
+   (lower triangle in the backward orientation: offsets 12..15, 8..10, 4, 5, 0).
+   Diagonals (b15, b10, b5, b0) are multiplied in, presumably pre-inverted. */
+static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12;
+    v4f32 src_b13, src_b14, src_b15;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+    LD_SP2(c_nxt2line, 4, src_c4, src_c5);
+    LD_SP2(c_nxt3line, 4, src_c6, src_c7);
+
+    /* GEMM update c -= a*b, unrolled by 2 */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b = LD_SP(bb + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        aa += 8;
+        bb += 4;
+
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b = LD_SP(bb + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+
+        aa += 8;
+        bb += 4;
+    }
+
+    /* leftover odd iteration */
+    if ((bk & 1) && (bk > 0))
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b = LD_SP(bb + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+        src_c4 -= src_a0 * src_b2;
+        src_c5 -= src_a1 * src_b2;
+        src_c6 -= src_a0 * src_b3;
+        src_c7 -= src_a1 * src_b3;
+    }
+
+    /* rewind to the blocks preceding the panel (caller passes past-the-end) */
+    a -= 32;
+    b -= 16;
+
+    src_b = LD_SP(b + 12);
+    SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
+    src_b8 = LD_SP(b + 8);
+    src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
+    src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
+    src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
+    src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+    src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    /* backward substitution: column 3 first, eliminated from 0..2, etc. */
+    src_c7 *= src_b15;
+    src_c6 *= src_b15;
+    src_c5 -= src_c7 * src_b14;
+    src_c4 -= src_c6 * src_b14;
+    src_c3 -= src_c7 * src_b13;
+    src_c2 -= src_c6 * src_b13;
+    src_c1 -= src_c7 * src_b12;
+    src_c0 -= src_c6 * src_b12;
+
+    src_c5 *= src_b10;
+    src_c4 *= src_b10;
+    src_c3 -= src_c5 * src_b9;
+    src_c2 -= src_c4 * src_b9;
+    src_c1 -= src_c5 * src_b8;
+    src_c0 -= src_c4 * src_b8;
+
+    src_c3 *= src_b5;
+    src_c2 *= src_b5;
+    src_c1 -= src_c3 * src_b4;
+    src_c0 -= src_c2 * src_b4;
+
+    src_c1 *= src_b0;
+    src_c0 *= src_b0;
+
+    /* store solved block to packed A and to C */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
+
+    ST_SP2(src_c0, src_c1, c, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+    ST_SP2(src_c4, src_c5, c_nxt2line, 4);
+    ST_SP2(src_c6, src_c7, c_nxt3line, 4);
+}
+
+/* RT-variant 8x2 solve: GEMM loop (unrolled x2 over bk), then a/b are
+   stepped back past the panel (a -= 16, b -= 4) and the two columns are
+   solved last-to-first using b[3], b[2], b[0] of the 2x2 triangular block
+   (diagonals b3 and b0 multiplied in, presumably pre-inverted). */
+static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a0, src_a1, src_b1;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
+    FLOAT *c_nxt1line = c + ldc;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+    LD_SP2(c_nxt1line, 4, src_c2, src_c3);
+
+    /* GEMM update c -= a*b, unrolled by 2 */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+
+        aa += 8;
+        bb += 2;
+
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+
+        aa += 8;
+        bb += 2;
+    }
+
+    /* leftover odd iteration */
+    if ((bk & 1) && (bk > 0))
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+        src_c2 -= src_a0 * src_b1;
+        src_c3 -= src_a1 * src_b1;
+    }
+
+    /* rewind to the blocks preceding the panel */
+    a -= 16;
+    b -= 4;
+
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+
+    /* backward substitution: column 1, then column 0 */
+    src_c2 *= src_b3;
+    src_c3 *= src_b3;
+    src_c0 -= src_c2 * src_b2;
+    src_c1 -= src_c3 * src_b2;
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    /* store solved block to packed A and to C */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP2(src_c0, src_c1, c, 4);
+    ST_SP2(src_c2, src_c3, c_nxt1line, 4);
+}
+
+/* RT-variant 8x1 solve: GEMM loop unrolled x4 over bk (with 2/1 tails),
+   then a/b are stepped back (a -= 8, b -= 1) and the column is scaled by
+   the single triangular entry b[0] (multiplied in, presumably pre-inverted).
+   Note this signature has no ldc parameter (single column of C). */
+static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
+
+    LD_SP2(c, 4, src_c0, src_c1);
+
+    /* GEMM update c -= a*b, unrolled by 4 */
+    for (k = 0; k < (bk >> 2); k++)
+    {
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        aa += 8;
+        bb += 1;
+
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        aa += 8;
+        bb += 1;
+
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        aa += 8;
+        bb += 1;
+
+        LD_SP2(aa, 4, src_a0, src_a1);
+
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a1 * src_b0;
+
+        aa += 8;
+        bb += 1;
+    }
+
+    /* leftover 1..3 iterations */
+    if ((bk & 3) && (bk > 0))
+    {
+        if (bk & 2)
+        {
+            LD_SP2(aa, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+
+            aa += 8;
+            bb += 1;
+
+            LD_SP2(aa, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+
+            aa += 8;
+            bb += 1;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2(aa, 4, src_a0, src_a1);
+
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+        }
+    }
+
+    /* rewind to the block preceding the panel */
+    a -= 8;
+    b -= 1;
+
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    /* store solved column to packed A and to C */
+    ST_SP2(src_c0, src_c1, a, 4);
+    ST_SP2(src_c0, src_c1, c, 4);
+}
+
+/* Solve a 4-row by 8-column tile of C against the packed triangular
+   8x8 block of B, proceeding backwards from column 7 to column 0
+   (the "RT" variant).  First subtracts the contribution of the bk
+   already-processed columns (a GEMM-like update), then back-substitutes.
+   Results are written both into the packed A buffer (behind the incoming
+   pointer) and into C.
+   NOTE(review): the code multiplies by the diagonal entries of B rather
+   than dividing, so the packed B block presumably stores reciprocal
+   diagonals -- confirm against the packing routines. */
+static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a0, src_b1, src_b2, src_b3;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
+    v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
+    v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
+    v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
+    v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
+    /* One pointer per C column; columns are ldc apart. */
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+    FLOAT *c_nxt4line = c + 4 * ldc;
+    FLOAT *c_nxt5line = c + 5 * ldc;
+    FLOAT *c_nxt6line = c + 6 * ldc;
+    FLOAT *c_nxt7line = c + 7 * ldc;
+
+    /* Load the 4x8 tile of C (4 floats per column) into vector registers. */
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+    src_c2 = LD_SP(c_nxt2line);
+    src_c3 = LD_SP(c_nxt3line);
+    src_c4 = LD_SP(c_nxt4line);
+    src_c5 = LD_SP(c_nxt5line);
+    src_c6 = LD_SP(c_nxt6line);
+    src_c7 = LD_SP(c_nxt7line);
+
+    /* Update step: C -= A * B over the bk trailing columns. */
+    for (k = 0; k < bk; k++)
+    {
+        src_a0 = LD_SP(aa);
+
+        /* Splat each of the 8 b values and accumulate into its column. */
+        src_b = LD_SP(bb + 0);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c0 -= src_a0 * src_b0;
+        src_c1 -= src_a0 * src_b1;
+        src_c2 -= src_a0 * src_b2;
+        src_c3 -= src_a0 * src_b3;
+
+        src_b = LD_SP(bb + 4);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+        src_c4 -= src_a0 * src_b0;
+        src_c5 -= src_a0 * src_b1;
+        src_c6 -= src_a0 * src_b2;
+        src_c7 -= src_a0 * src_b3;
+
+        aa += 4;
+        bb += 8;
+    }
+
+    /* Rewind to the 4x8 destination slot in packed A and the packed
+       8x8 triangular block of B. */
+    a -= 32;
+    b -= 64;
+
+    /* Splat the triangular entries of B needed for back-substitution;
+       entry (row j, col i) lives at b[8*j + i]. */
+    src_b = LD_SP(b + 60);
+    SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
+    src_b = LD_SP(b + 56);
+    SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
+
+    src_b = LD_SP(b + 48);
+    SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
+    src_b52 = LD_SP(b + 52);
+    src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
+    src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
+    src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
+
+    src_b = LD_SP(b + 40);
+    SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
+    src_b44 = LD_SP(b + 44);
+    src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
+    src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
+
+    src_b = LD_SP(b + 32);
+    SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
+    src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
+
+    src_b = LD_SP(b + 24);
+    SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
+
+    src_b16 = LD_SP(b + 16);
+    src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
+    src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
+    src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
+
+    src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+    src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    /* Back-substitution, columns 7 down to 0: scale column j by its
+       diagonal factor, then fold it back into all earlier columns. */
+    src_c7 *= src_b63;
+    src_c6 -= src_c7 * src_b62;
+    src_c5 -= src_c7 * src_b61;
+    src_c4 -= src_c7 * src_b60;
+    src_c3 -= src_c7 * src_b59;
+    src_c2 -= src_c7 * src_b58;
+    src_c1 -= src_c7 * src_b57;
+    src_c0 -= src_c7 * src_b56;
+
+    src_c6 *= src_b54;
+    src_c5 -= src_c6 * src_b53;
+    src_c4 -= src_c6 * src_b52;
+    src_c3 -= src_c6 * src_b51;
+    src_c2 -= src_c6 * src_b50;
+    src_c1 -= src_c6 * src_b49;
+    src_c0 -= src_c6 * src_b48;
+
+    src_c5 *= src_b45;
+    src_c4 -= src_c5 * src_b44;
+    src_c3 -= src_c5 * src_b43;
+    src_c2 -= src_c5 * src_b42;
+    src_c1 -= src_c5 * src_b41;
+    src_c0 -= src_c5 * src_b40;
+
+    src_c4 *= src_b36;
+    src_c3 -= src_c4 * src_b35;
+    src_c2 -= src_c4 * src_b34;
+    src_c1 -= src_c4 * src_b33;
+    src_c0 -= src_c4 * src_b32;
+
+    src_c3 *= src_b27;
+    src_c2 -= src_c3 * src_b26;
+    src_c1 -= src_c3 * src_b25;
+    src_c0 -= src_c3 * src_b24;
+
+    src_c2 *= src_b18;
+    src_c1 -= src_c2 * src_b17;
+    src_c0 -= src_c2 * src_b16;
+
+    src_c1 *= src_b9;
+    src_c0 -= src_c1 * src_b8;
+
+    src_c0 *= src_b0;
+
+    /* Store the solved tile into packed A and back into the 8 C columns. */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+    ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+    ST_SP(src_c2, c_nxt2line);
+    ST_SP(src_c3, c_nxt3line);
+    ST_SP(src_c4, c_nxt4line);
+    ST_SP(src_c5, c_nxt5line);
+    ST_SP(src_c6, c_nxt6line);
+    ST_SP(src_c7, c_nxt7line);
+}
+
+/* Solve a 4-row by 4-column tile of C against the packed triangular
+   4x4 block of B, backwards from column 3 to column 0.  The bk update
+   loop is unrolled twice, with a tail for odd bk.  Results go to both
+   the packed A buffer and C.
+   NOTE(review): multiplies by B's diagonal entries, so the packed block
+   presumably stores reciprocals -- confirm against the packing code. */
+static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
+    v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
+    v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    /* Load the 4x4 tile of C (one vector per column). */
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+    src_c2 = LD_SP(c_nxt2line);
+    src_c3 = LD_SP(c_nxt3line);
+
+    /* Update step (unrolled x2): C -= A * B over bk trailing columns. */
+    for (k = 0; k < (bk >> 1); k++)
+    {
+        src_a = LD_SP(aa);
+
+        src_b = LD_SP(bb);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+        src_c2 -= src_a * src_b2;
+        src_c3 -= src_a * src_b3;
+
+        aa += 4;
+        bb += 4;
+
+        src_a = LD_SP(aa);
+
+        src_b = LD_SP(bb);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+        src_c2 -= src_a * src_b2;
+        src_c3 -= src_a * src_b3;
+
+        aa += 4;
+        bb += 4;
+    }
+
+    /* Tail for odd bk. */
+    if ((bk & 1) && (bk > 0))
+    {
+        src_a = LD_SP(aa);
+
+        src_b = LD_SP(bb);
+        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+        src_c2 -= src_a * src_b2;
+        src_c3 -= src_a * src_b3;
+    }
+
+    /* Rewind to the packed destination in A and the 4x4 triangle of B;
+       entry (row j, col i) of the triangle lives at b[4*j + i]. */
+    a -= 16;
+    b -= 16;
+
+    src_b = LD_SP(b + 12);
+    SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
+    src_b8 = LD_SP(b + 8);
+    src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
+    src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
+    src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
+    src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+    src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    /* Back-substitution, columns 3 down to 0. */
+    src_c3 *= src_b15;
+    src_c2 -= src_c3 * src_b14;
+    src_c1 -= src_c3 * src_b13;
+    src_c0 -= src_c3 * src_b12;
+
+    src_c2 *= src_b10;
+    src_c1 -= src_c2 * src_b9;
+    src_c0 -= src_c2 * src_b8;
+
+    src_c1 *= src_b5;
+    src_c0 -= src_c1 * src_b4;
+
+    src_c0 *= src_b0;
+
+    /* Store the solved tile into packed A and back into C. */
+    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+    ST_SP(src_c2, c_nxt2line);
+    ST_SP(src_c3, c_nxt3line);
+}
+
+/* Solve a 4-row by 2-column tile of C against the packed triangular
+   2x2 block of B (entries b[0], b[2], b[3] after the rewind), column 1
+   then column 0.  The bk update loop is unrolled four times, with
+   tails for the bk&2 and bk&1 remainders.
+   NOTE(review): multiplies by B's diagonal entries, so the packed block
+   presumably stores reciprocals -- confirm against the packing code. */
+static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG k;
+    FLOAT *aa = a, *bb = b;
+    v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3;
+    FLOAT *c_nxt1line = c + ldc;
+
+    /* Load the two C columns. */
+    src_c0 = LD_SP(c);
+    src_c1 = LD_SP(c_nxt1line);
+
+    /* Update step (unrolled x4): C -= A * B over bk trailing columns.
+       Each iteration splats the two b scalars from one loaded vector. */
+    for (k = 0; k < (bk >> 2); k++)
+    {
+        src_a = LD_SP(aa);
+        src_b0 = LD_SP(bb);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        aa += 4;
+        bb += 2;
+
+        src_a = LD_SP(aa);
+        src_b0 = LD_SP(bb);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        aa += 4;
+        bb += 2;
+
+        src_a = LD_SP(aa);
+        src_b0 = LD_SP(bb);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        aa += 4;
+        bb += 2;
+
+        src_a = LD_SP(aa);
+        src_b0 = LD_SP(bb);
+        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+        src_c0 -= src_a * src_b0;
+        src_c1 -= src_a * src_b1;
+
+        aa += 4;
+        bb += 2;
+    }
+
+    /* Remainder iterations: two more if bk&2, one more if bk&1. */
+    if ((bk & 3) && (bk > 0))
+    {
+        if (bk & 2)
+        {
+            src_a = LD_SP(aa);
+            src_b0 = LD_SP(bb);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+
+            aa += 4;
+            bb += 2;
+
+            src_a = LD_SP(aa);
+            src_b0 = LD_SP(bb);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+
+            aa += 4;
+            bb += 2;
+        }
+
+        if (bk & 1)
+        {
+            src_a = LD_SP(aa);
+            src_b0 = LD_SP(bb);
+            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
+            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
+
+            src_c0 -= src_a * src_b0;
+            src_c1 -= src_a * src_b1;
+        }
+    }
+
+    /* Rewind to the packed destination in A and the 2x2 triangle of B. */
+    a -= 8;
+    b -= 4;
+
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+    src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+
+    /* Back-substitution: column 1 first, then fold into column 0. */
+    src_c1 *= src_b3;
+    src_c0 -= src_c1 * src_b2;
+    src_c0 *= src_b0;
+
+    /* Store the solved tile into packed A and back into C. */
+    ST_SP2(src_c0, src_c1, a, 4);
+
+    ST_SP(src_c0, c);
+    ST_SP(src_c1, c_nxt1line);
+}
+
+/* Solve a 4-row by 1-column tile of C: subtract the contribution of the
+   bk trailing columns, scale by the single B factor at b[-1], and write
+   the result to both the packed A slot (a[-4..-1]) and C. */
+static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT scale;
+    FLOAT res[4];
+
+    /* Fetch the 4x1 tile of C. */
+    for (j = 0; j < 4; j++)
+    {
+        res[j] = c[j];
+    }
+
+    /* res -= A(:, i) * B(i) over the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            res[j] -= a[4 * i + j] * b[i];
+        }
+    }
+
+    /* Scale by the stored B factor just behind the incoming pointer. */
+    scale = *(b - 1);
+
+    for (j = 0; j < 4; j++)
+    {
+        res[j] *= scale;
+    }
+
+    /* Write back to the packed A slot and to C. */
+    for (j = 0; j < 4; j++)
+    {
+        *(a - 4 + j) = res[j];
+        c[j] = res[j];
+    }
+}
+
+/* Solve a 2-row by 8-column tile of C against the packed triangular 8x8
+   block of B, backwards from column 7 to column 0.  The triangle entry
+   (row j, col i) lives at b[8*j + i] after the rewind; b[8*j + j] is the
+   stored diagonal factor for column j.  Results are written to both the
+   packed A buffer and C. */
+static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j, col;
+    FLOAT r0[8], r1[8];
+
+    /* Fetch the 2x8 tile of C: two rows, eight columns spaced by ldc. */
+    for (col = 0; col < 8; col++)
+    {
+        r0[col] = *(c + 0 + col * ldc);
+        r1[col] = *(c + 1 + col * ldc);
+    }
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        for (col = 0; col < 8; col++)
+        {
+            r0[col] -= a[2 * i + 0] * b[8 * i + col];
+            r1[col] -= a[2 * i + 1] * b[8 * i + col];
+        }
+    }
+
+    /* Rewind to the packed 2x8 destination in A and the triangle of B. */
+    a -= 16;
+    b -= 64;
+
+    /* Back-substitution: scale column j by its diagonal factor, then
+       fold the solved column back into all earlier columns. */
+    for (j = 7; j >= 0; j--)
+    {
+        r0[j] *= *(b + 8 * j + j);
+        r1[j] *= *(b + 8 * j + j);
+
+        for (i = 0; i < j; i++)
+        {
+            r0[i] -= r0[j] * *(b + 8 * j + i);
+            r1[i] -= r1[j] * *(b + 8 * j + i);
+        }
+    }
+
+    /* Store the solved tile into packed A and back into C. */
+    for (col = 0; col < 8; col++)
+    {
+        *(a + 2 * col + 0) = r0[col];
+        *(a + 2 * col + 1) = r1[col];
+
+        *(c + 0 + col * ldc) = r0[col];
+        *(c + 1 + col * ldc) = r1[col];
+    }
+}
+
+/* Solve a 2-row by 4-column tile of C against the packed triangular 4x4
+   block of B, backwards from column 3 to column 0.  Triangle entry
+   (row j, col i) lives at b[4*j + i] after the rewind.  Results go to
+   both the packed A buffer and C. */
+static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j, col;
+    FLOAT r0[4], r1[4];
+
+    /* Fetch the 2x4 tile of C. */
+    for (col = 0; col < 4; col++)
+    {
+        r0[col] = *(c + 0 + col * ldc);
+        r1[col] = *(c + 1 + col * ldc);
+    }
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        for (col = 0; col < 4; col++)
+        {
+            r0[col] -= a[2 * i + 0] * b[4 * i + col];
+            r1[col] -= a[2 * i + 1] * b[4 * i + col];
+        }
+    }
+
+    /* Rewind to the packed destination in A and the triangle of B. */
+    a -= 8;
+    b -= 16;
+
+    /* Back-substitution over columns 3..0. */
+    for (j = 3; j >= 0; j--)
+    {
+        r0[j] *= *(b + 4 * j + j);
+        r1[j] *= *(b + 4 * j + j);
+
+        for (i = 0; i < j; i++)
+        {
+            r0[i] -= r0[j] * *(b + 4 * j + i);
+            r1[i] -= r1[j] * *(b + 4 * j + i);
+        }
+    }
+
+    /* Store the solved tile into packed A and back into C. */
+    for (col = 0; col < 4; col++)
+    {
+        *(a + 2 * col + 0) = r0[col];
+        *(a + 2 * col + 1) = r1[col];
+
+        *(c + 0 + col * ldc) = r0[col];
+        *(c + 1 + col * ldc) = r1[col];
+    }
+}
+
+/* Solve a 2-row by 2-column tile of C against the packed 2x2 triangle
+   of B (b[0], b[2], b[3] after the rewind): column 1 first, then fold
+   into column 0.  Results go to both packed A and C. */
+static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t00, t10, t01, t11;
+
+    /* Fetch the 2x2 tile of C (rows 0/1, columns 0/1). */
+    t00 = c[0];
+    t10 = c[1];
+    t01 = c[ldc];
+    t11 = c[ldc + 1];
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        t00 -= a[2 * i + 0] * b[2 * i + 0];
+        t10 -= a[2 * i + 1] * b[2 * i + 0];
+        t01 -= a[2 * i + 0] * b[2 * i + 1];
+        t11 -= a[2 * i + 1] * b[2 * i + 1];
+    }
+
+    /* Rewind to the packed destination in A and the triangle of B. */
+    a -= 4;
+    b -= 4;
+
+    /* Back-substitution: scale column 1, fold into column 0, scale it. */
+    t01 *= b[3];
+    t11 *= b[3];
+
+    t00 -= t01 * b[2];
+    t10 -= t11 * b[2];
+
+    t00 *= b[0];
+    t10 *= b[0];
+
+    /* Store into packed A and back into C. */
+    a[0] = t00;
+    a[1] = t10;
+    a[2] = t01;
+    a[3] = t11;
+
+    c[0] = t00;
+    c[1] = t10;
+    c[ldc] = t01;
+    c[ldc + 1] = t11;
+}
+
+/* Solve a 2-row by 1-column tile of C: subtract the trailing-panel
+   contribution, scale by the B factor at b[-1], and write the result
+   to both the packed A slot (a[-2..-1]) and C. */
+static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT r0 = c[0];
+    FLOAT r1 = c[1];
+
+    /* r -= A(:, i) * B(i) over the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        r0 -= a[2 * i + 0] * b[i];
+        r1 -= a[2 * i + 1] * b[i];
+    }
+
+    /* Scale by the stored factor just behind the incoming b pointer. */
+    r0 *= *(b - 1);
+    r1 *= *(b - 1);
+
+    /* Write back to packed A and to C. */
+    *(a - 2) = r0;
+    *(a - 1) = r1;
+
+    c[0] = r0;
+    c[1] = r1;
+}
+
+/* Solve a 1-row by 8-column tile of C against the packed triangular 8x8
+   block of B, backwards from column 7 to column 0.  Triangle entry
+   (row j, col i) lives at b[8*j + i] after the rewind.  Results go to
+   both the packed A buffer and C. */
+static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT res[8];
+
+    /* Fetch the eight C entries, one per column. */
+    for (j = 0; j < 8; j++)
+    {
+        res[j] = *(c + j * ldc);
+    }
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+            res[j] -= a[i] * b[8 * i + j];
+        }
+    }
+
+    /* Rewind to the packed destination in A and the triangle of B. */
+    a -= 8;
+    b -= 64;
+
+    /* Back-substitution: scale column j by its diagonal factor, then
+       fold it back into all earlier columns. */
+    for (j = 7; j >= 0; j--)
+    {
+        res[j] *= *(b + 8 * j + j);
+
+        for (i = 0; i < j; i++)
+        {
+            res[i] -= res[j] * *(b + 8 * j + i);
+        }
+    }
+
+    /* Store into packed A and back into C. */
+    for (j = 0; j < 8; j++)
+    {
+        *(a + j) = res[j];
+        *(c + j * ldc) = res[j];
+    }
+}
+
+/* Solve a 1-row by 4-column tile of C against the packed triangular 4x4
+   block of B, backwards from column 3 to column 0.  Triangle entry
+   (row j, col i) lives at b[4*j + i] after the rewind. */
+static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT res[4];
+
+    /* Fetch the four C entries, one per column. */
+    for (j = 0; j < 4; j++)
+    {
+        res[j] = *(c + j * ldc);
+    }
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            res[j] -= a[i] * b[4 * i + j];
+        }
+    }
+
+    /* Rewind to the packed destination in A and the triangle of B. */
+    a -= 4;
+    b -= 16;
+
+    /* Back-substitution over columns 3..0. */
+    for (j = 3; j >= 0; j--)
+    {
+        res[j] *= *(b + 4 * j + j);
+
+        for (i = 0; i < j; i++)
+        {
+            res[i] -= res[j] * *(b + 4 * j + i);
+        }
+    }
+
+    /* Store into packed A and back into C. */
+    for (j = 0; j < 4; j++)
+    {
+        *(a + j) = res[j];
+        *(c + j * ldc) = res[j];
+    }
+}
+
+/* Solve a 1-row by 2-column tile of C against the packed 2x2 triangle
+   of B (b[0], b[2], b[3] after the rewind): column 1 first, then fold
+   into column 0. */
+static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT r0 = c[0];
+    FLOAT r1 = c[ldc];
+
+    /* Subtract the contribution of the bk trailing columns. */
+    for (i = 0; i < bk; i++)
+    {
+        r0 -= a[i] * b[2 * i + 0];
+        r1 -= a[i] * b[2 * i + 1];
+    }
+
+    /* Rewind to the packed destination in A and the triangle of B. */
+    a -= 2;
+    b -= 4;
+
+    /* Back-substitution: scale column 1, fold into column 0, scale it. */
+    r1 *= b[3];
+
+    r0 -= r1 * b[2];
+    r0 *= b[0];
+
+    /* Store into packed A and back into C. */
+    a[0] = r0;
+    a[1] = r1;
+
+    c[0] = r0;
+    c[ldc] = r1;
+}
+
+/* Solve a single C element: subtract the dot product of the bk trailing
+   entries, scale by the factor at a[-1], and mirror the result into
+   b[-1].  Note the parameter roles are swapped relative to the other
+   solvers: the caller (the m & 1 case in CNAME below) passes its B
+   panel as 'a' and its packed A panel as 'b', so the scale factor comes
+   from B and the packed copy lands in the A buffer. */
+static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG k;
+
+    /* *c -= dot(a, b) over the bk trailing entries. */
+    for (k = 0; k < bk; k++)
+    {
+        *c -= a[k] * b[k];
+    }
+
+    /* Scale by the stored factor and write the packed copy. */
+    *c *= *(a - 1);
+    *(b - 1) = *c;
+}
+
+/* Driver for this solve kernel (the file's tile solvers are all *_rt_*,
+   i.e. the right/transposed TRSM case).  Walks the n columns of C from
+   the last panel backwards: first the n&1 / n&2 / n&4 remainder panels,
+   then full panels of 8.  Within each panel, rows are processed in tiles
+   of 8, then 4 / 2 / 1 remainders.  'kk' tracks the column offset into
+   the packed panels; each tile solver receives the packed A and B
+   panels advanced by kk and the remaining depth (k - kk). */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    FLOAT *aa, *cc;
+    BLASLONG i, j, kk;
+
+    /* Start past the last column block and step backwards. */
+    kk = n - offset;
+    c += n * ldc;
+    b += n * k;
+
+    if (n & 7)
+    {
+        /* Remainder panel of width 1. */
+        if (n & 1)
+        {
+            aa = a;
+            b -= k;
+            c -= ldc;
+            cc = c;
+
+            /* Full tiles of 8 rows. */
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    /* Deliberate: B panel first, A panel second -- the
+                       1x1 solver scales by B and stores into packed A. */
+                    ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            kk -= 1;
+        }
+
+        /* Remainder panel of width 2. */
+        if (n & 2)
+        {
+            aa = a;
+            b -= 2 * k;
+            c -= 2 * ldc;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            kk -= 2;
+        }
+
+        /* Remainder panel of width 4. */
+        if (n & 4)
+        {
+            aa = a;
+            b -= 4 * k;
+            c -= 4 * ldc;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            kk -= 4;
+        }
+    }
+
+    /* Full panels of width 8, still moving right to left. */
+    for (j = (n >> 3); j--;)
+    {
+        aa = a;
+        b -= 8 * k;
+        c -= 8 * ldc;
+        cc = c;
+
+        for (i = (m >> 3); i--;)
+        {
+            ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
+
+            aa += 8 * k;
+            cc += 8;
+        }
+
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));
+
+                aa += 4 * k;
+                cc += 4;
+            }
+
+            if (m & 2)
+            {
+                ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));
+
+                aa += 2 * k;
+                cc += 2;
+            }
+
+            if (m & 1)
+            {
+                ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));
+
+                aa += k;
+                cc += 1;
+            }
+        }
+
+        kk -= 8;
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/swap.c b/kernel/mips/swap.c
new file mode 100644
index 000000000..23f7a3580
--- /dev/null
+++ b/kernel/mips/swap.c
@@ -0,0 +1,55 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include
+
+/* Generic strided swap: exchanges n elements between x and y, stepping
+   inc_x / inc_y through each array.  The dummy arguments exist only to
+   match the common kernel calling convention.  Always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG i;
+    BLASLONG ix = 0, iy = 0;
+    FLOAT tmp;
+
+    if ( n < 0 ) return(0);
+
+    for (i = 0; i < n; i++)
+    {
+        tmp   = x[ix];
+        x[ix] = y[iy];
+        y[iy] = tmp;
+
+        ix += inc_x;
+        iy += inc_y;
+    }
+
+    return(0);
+}
+
+
diff --git a/kernel/mips/symv_L.c b/kernel/mips/symv_L.c
new file mode 100644
index 000000000..6a83d73f9
--- /dev/null
+++ b/kernel/mips/symv_L.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ FLOAT temp1;
+ FLOAT temp2;
+
+#if 0
+ if ( m != offset )
+ printf("Symv_L: m=%d offset=%d\n",m,offset);
+#endif
+
+ jx = 0;
+ jy = 0;
+
+ for (j=0; j
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+/* Returns the largest CABS1 value (|re| + |im|) over n complex elements
+   of x, stepping inc_x complex elements (2*inc_x floats) at a time.
+   Returns 0.0 for empty input or a non-positive stride. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i;
+    BLASLONG ix;
+    BLASLONG inc_x2;
+    FLOAT maxf;
+    FLOAT cur;
+
+    if (n <= 0 || inc_x <= 0) return(0.0);
+
+    inc_x2 = 2 * inc_x;
+
+    /* Seed with the first element, then scan the rest. */
+    maxf = CABS1(x,0);
+
+    for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
+    {
+        cur = CABS1(x,ix);
+
+        if (cur > maxf)
+        {
+            maxf = cur;
+        }
+    }
+
+    return(maxf);
+}
+
+
diff --git a/kernel/mips/zamin.c b/kernel/mips/zamin.c
new file mode 100644
index 000000000..97c07da81
--- /dev/null
+++ b/kernel/mips/zamin.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+/* Returns the smallest CABS1 value (|re| + |im|) over n complex elements
+   of x, stepping inc_x complex elements (2*inc_x floats) at a time.
+   Returns 0.0 for empty input or a non-positive stride. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i;
+    BLASLONG ix;
+    BLASLONG inc_x2;
+    FLOAT minf;
+    FLOAT cur;
+
+    if (n <= 0 || inc_x <= 0) return(0.0);
+
+    inc_x2 = 2 * inc_x;
+
+    /* Seed with the first element, then scan the rest. */
+    minf = CABS1(x,0);
+
+    for (i = 1, ix = inc_x2; i < n; i++, ix += inc_x2)
+    {
+        cur = CABS1(x,ix);
+
+        if (cur < minf)
+        {
+            minf = cur;
+        }
+    }
+
+    return(minf);
+}
+
+
diff --git a/kernel/mips/zasum.c b/kernel/mips/zasum.c
new file mode 100644
index 000000000..77a2ed685
--- /dev/null
+++ b/kernel/mips/zasum.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
+
+/* Sums CABS1 (|re| + |im|) over n complex elements of x, stepping inc_x
+   complex elements (2*inc_x floats) at a time.  Returns 0.0 for empty
+   input or a non-positive stride. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i;
+    BLASLONG inc_x2;
+    BLASLONG limit;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    inc_x2 = 2 * inc_x;
+    limit = n * inc_x2;
+
+    /* Walk float offsets 0, inc_x2, 2*inc_x2, ... for n iterations. */
+    for (i = 0; i < limit; i += inc_x2)
+    {
+        sumf += CABS1(x,i);
+    }
+
+    return(sumf);
+}
+
+
diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c
new file mode 100644
index 000000000..c84d48ecb
--- /dev/null
+++ b/kernel/mips/zasum_msa.c
@@ -0,0 +1,170 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
+
+#define PROCESS_ZD(inc_val) \
+ if (n > 8) \
+ { \
+ n -= 8; \
+ \
+ LD_DP8_INC(x, inc_val, src0, src1, src2, \
+ src3, src4, src5, src6, src7); \
+ \
+ sum_abs0 = AND_VEC_D(src0); \
+ sum_abs1 = AND_VEC_D(src1); \
+ sum_abs2 = AND_VEC_D(src2); \
+ sum_abs3 = AND_VEC_D(src3); \
+ sum_abs0 += AND_VEC_D(src4); \
+ sum_abs1 += AND_VEC_D(src5); \
+ sum_abs2 += AND_VEC_D(src6); \
+ sum_abs3 += AND_VEC_D(src7); \
+ } \
+ else \
+ { \
+ sum_abs0 = zero_v; \
+ sum_abs1 = zero_v; \
+ sum_abs2 = zero_v; \
+ sum_abs3 = zero_v; \
+ } \
+ \
+ for (i = (n >> 3); i--;) \
+ { \
+ LD_DP8_INC(x, inc_val, src0, src1, src2, \
+ src3, src4, src5, src6, src7); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ sum_abs3 += AND_VEC_D(src3); \
+ sum_abs0 += AND_VEC_D(src4); \
+ sum_abs1 += AND_VEC_D(src5); \
+ sum_abs2 += AND_VEC_D(src6); \
+ sum_abs3 += AND_VEC_D(src7); \
+ } \
+ \
+ if (n & 7) \
+ { \
+ if ((n & 4) && (n & 2) && (n & 1)) \
+ { \
+ LD_DP7_INC(x, inc_val, src0, src1, src2, \
+ src3, src4, src5, src6); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ sum_abs3 += AND_VEC_D(src3); \
+ sum_abs0 += AND_VEC_D(src4); \
+ sum_abs1 += AND_VEC_D(src5); \
+ sum_abs2 += AND_VEC_D(src6); \
+ } \
+ else if ((n & 4) && (n & 2)) \
+ { \
+ LD_DP6_INC(x, inc_val, src0, src1, src2, \
+ src3, src4, src5); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ sum_abs3 += AND_VEC_D(src3); \
+ sum_abs0 += AND_VEC_D(src4); \
+ sum_abs1 += AND_VEC_D(src5); \
+ } \
+ else if ((n & 4) && (n & 1)) \
+ { \
+ LD_DP5_INC(x, inc_val, src0, src1, src2, \
+ src3, src4); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ sum_abs3 += AND_VEC_D(src3); \
+ sum_abs0 += AND_VEC_D(src4); \
+ } \
+ else if ((n & 2) && (n & 1)) \
+ { \
+ LD_DP3_INC(x, inc_val, src0, src1, src2); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ } \
+ else if (n & 4) \
+ { \
+ LD_DP4_INC(x, inc_val, src0, src1, src2, \
+ src3); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ sum_abs2 += AND_VEC_D(src2); \
+ sum_abs3 += AND_VEC_D(src3); \
+ } \
+ else if (n & 2) \
+ { \
+ LD_DP2_INC(x, inc_val, src0, src1); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ sum_abs1 += AND_VEC_D(src1); \
+ } \
+ else if (n & 1) \
+ { \
+ src0 = LD_DP(x); \
+ \
+ sum_abs0 += AND_VEC_D(src0); \
+ } \
+ } \
+ \
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \
+ sumf = sum_abs0[0] + sum_abs0[1];
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i;
+ FLOAT sumf = 0.0;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+ v2f64 zero_v = {0};
+ v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
+
+ if (n <= 0 || inc_x <= 0) return (sumf);
+
+ if (1 == inc_x)
+ {
+ PROCESS_ZD(2);
+ }
+ else
+ {
+ inc_x *= 2;
+ PROCESS_ZD(inc_x);
+ }
+
+ return (sumf);
+}
diff --git a/kernel/mips/zaxpby.c b/kernel/mips/zaxpby.c
new file mode 100644
index 000000000..97452e942
--- /dev/null
+++ b/kernel/mips/zaxpby.c
@@ -0,0 +1,113 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix,iy;
+ FLOAT temp;
+ BLASLONG inc_x2, inc_y2;
+
+ if ( n <= 0 ) return(0);
+
+ ix = 0;
+ iy = 0;
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ if ( beta_r == 0.0 && beta_i == 0.0)
+ {
+ if ( alpha_r == 0.0 && alpha_i == 0.0 )
+ {
+
+ while(i < n)
+ {
+ y[iy] = 0.0 ;
+ y[iy+1] = 0.0 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ;
+ y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+
+ }
+
+ }
+ else
+ {
+ if ( alpha_r == 0.0 && alpha_i == 0.0 )
+ {
+
+ while(i < n)
+ {
+ temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ;
+ y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ;
+ y[iy] = temp;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+ }
+ else
+ {
+
+ while(i < n)
+ {
+ temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ;
+ y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ;
+ y[iy] = temp;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+ }
+
+
+ }
+
+
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/zaxpy.c b/kernel/mips/zaxpy.c
new file mode 100644
index 000000000..f0fbab4a2
--- /dev/null
+++ b/kernel/mips/zaxpy.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix,iy;
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+
+ if ( n < 0 ) return(0);
+ if ( da_r == 0.0 && da_i == 0.0 ) return(0);
+
+ ix = 0;
+ iy = 0;
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/zcopy.c b/kernel/mips/zcopy.c
new file mode 100644
index 000000000..6bb6e33b6
--- /dev/null
+++ b/kernel/mips/zcopy.c
@@ -0,0 +1,56 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+
+ if ( n < 0 ) return(0);
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+
+ y[iy] = x[ix] ;
+ y[iy+1] = x[ix+1] ;
+ ix += inc_x2;
+ iy += inc_y2;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips/zdot.c b/kernel/mips/zdot.c
new file mode 100644
index 000000000..da9ec7076
--- /dev/null
+++ b/kernel/mips/zdot.c
@@ -0,0 +1,75 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT dot[2];
+ OPENBLAS_COMPLEX_FLOAT result;
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+
+ dot[0]=0.0;
+ dot[1]=0.0;
+
+ CREAL(result) = 0.0 ;
+ CIMAG(result) = 0.0 ;
+
+ if ( n < 1 ) return(result);
+
+ inc_x2 = 2 * inc_x ;
+ inc_y2 = 2 * inc_y ;
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ;
+ dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ;
+#else
+ dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ;
+ dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ;
+#endif
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+ CREAL(result) = dot[0];
+ CIMAG(result) = dot[1];
+ return(result);
+
+}
+
+
diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c
new file mode 100644
index 000000000..b94509392
--- /dev/null
+++ b/kernel/mips/zdot_msa.c
@@ -0,0 +1,227 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#if !defined(CONJ)
+ #define OP2 +=
+ #define OP3 -
+ #define OP4 +
+#else
+ #define OP2 -=
+ #define OP3 +
+ #define OP4 -
+#endif
+
+#define DOT16_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i); \
+ \
+ dot0 += (vx2r * vy2r); \
+ dot0 OPR0## = (vx2i * vy2i); \
+ dot1 OPR1## = (vx2i * vy2r); \
+ dot1 += (vx2r * vy2i); \
+ \
+ dot0 += (vx3r * vy3r); \
+ dot0 OPR0## = (vx3i * vy3i); \
+ dot1 OPR1## = (vx3i * vy3r); \
+ dot1 += (vx3r * vy3i);
+
+#define DOT12_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i); \
+ \
+ dot0 += (vx2r * vy2r); \
+ dot0 OPR0## = (vx2i * vy2i); \
+ dot1 OPR1## = (vx2i * vy2r); \
+ dot1 += (vx2r * vy2i);
+
+#define DOT8_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i); \
+ \
+ dot0 += (vx1r * vy1r); \
+ dot0 OPR0## = (vx1i * vy1i); \
+ dot1 OPR1## = (vx1i * vy1r); \
+ dot1 += (vx1r * vy1i);
+
+#define DOT4_KERNEL(OPR0, OPR1) \
+ dot0 += (vx0r * vy0r); \
+ dot0 OPR0## = (vx0i * vy0i); \
+ dot1 OPR1## = (vx0i * vy0r); \
+ dot1 += (vx0r * vy0i);
+
+/* return double, x,y double */
+/* zdotc - CONJ */
+/* zdotu - !CONJ */
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+ BLASLONG i = 0;
+ FLOAT dot[2];
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+ v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+ v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+ v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+ v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+ v2f64 dot0 = {0, 0};
+ v2f64 dot1 = {0, 0};
+ v2f64 zero = {0, 0};
+ openblas_complex_double result;
+
+ dot[0] = 0.0;
+ dot[1] = 0.0;
+
+ __real__(result) = 0.0;
+ __imag__(result) = 0.0;
+
+ if ( n < 1 ) return(result);
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ for (i = (n >> 3); i--;)
+ {
+ LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+ PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+ PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+ #if !defined(CONJ)
+ DOT16_KERNEL(-, +);
+ #else
+ DOT16_KERNEL(+, -);
+ #endif
+ }
+
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2))
+ {
+ LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+ LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+ LD_DP2_INC(x, inc_x2, vx4, vx5);
+ LD_DP2_INC(y, inc_y2, vy4, vy5);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+
+ #if !defined(CONJ)
+ DOT12_KERNEL(-, +);
+ #else
+ DOT12_KERNEL(+, -);
+ #endif
+ }
+ else if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+ LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+
+ #if !defined(CONJ)
+ DOT8_KERNEL(-, +);
+ #else
+ DOT8_KERNEL(+, -);
+ #endif
+ }
+ else if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x2, vx0, vx1);
+ LD_DP2_INC(y, inc_y2, vy0, vy1);
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+
+ #if !defined(CONJ)
+ DOT4_KERNEL(-, +);
+ #else
+ DOT4_KERNEL(+, -);
+ #endif
+ }
+
+ if (n & 1)
+ {
+ vx0 = LD_DP(x);
+ vy0 = LD_DP(y);
+ PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+
+ #if !defined(CONJ)
+ DOT4_KERNEL(-, +);
+ #else
+ DOT4_KERNEL(+, -);
+ #endif
+ }
+ }
+
+ dot[0] += (dot0[0] + dot0[1]);
+ dot[1] += (dot1[0] + dot1[1]);
+
+ __real__(result) = dot[0];
+ __imag__(result) = dot[1];
+
+ return(result);
+}
diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c
new file mode 100644
index 000000000..a185c69dd
--- /dev/null
+++ b/kernel/mips/zgemm_kernel_4x4_msa.c
@@ -0,0 +1,1589 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = OP4 src_a1r * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_D2_DP(src_b2, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = OP4 src_a0r * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ res5_r OP0## = src_a1r * src_br; \
+ res5_r OP1## = src_a1i * src_bi; \
+ res5_i OP2## = OP4 src_a1r * src_bi; \
+ res5_i OP3## = src_a1i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_D2_DP(src_b3, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = OP4 src_a0r * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+ \
+ res7_r OP0## = src_a1r * src_br; \
+ res7_r OP1## = src_a1i * src_bi; \
+ res7_i OP2## = OP4 src_a1r * src_bi; \
+ res7_i OP3## = src_a1i * src_br; \
+}
+
+#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_D2_DP(src_b2, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = OP4 src_a0r * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_D2_DP(src_b3, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = OP4 src_a0r * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+}
+
+#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ src_a0 = LD_DP(pa0); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th and 1st col */ \
+ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 2nd and 3rd col */ \
+ PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \
+ res1_r OP0## = src_a0r * src_br; \
+ res1_r OP1## = src_a0i * src_bi; \
+ res1_i OP2## = OP4 src_a0r * src_bi; \
+ res1_i OP3## = src_a0i * src_br; \
+}
+
+#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = OP4 src_a1r * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+}
+
+#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+}
+
+#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ src_a0 = LD_DP(pa0); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th and 1st col */ \
+ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+}
+
+#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ src_b0 = LD_DP(pb0); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+}
+
+#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ src_b0 = LD_DP(pb0); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+}
+
+#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+}
+
+#define ZGEMM_SCALE_4X4_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+ \
+ LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r += alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i += alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r += alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i += alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
+}
+
+#define ZGEMM_SCALE_2X4_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ LD_DP2(pc1, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+ \
+ LD_DP2(pc2, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ LD_DP2(pc3, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc2, 2); \
+ ST_DP2_INC(dst2, dst3, pc3, 2); \
+}
+
+#define ZGEMM_SCALE_1X4_MSA \
+{ \
+ dst0 = LD_DP(pc0); \
+ dst1 = LD_DP(pc1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst2 = LD_DP(pc2); \
+ dst3 = LD_DP(pc3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res1_r; \
+ dst0_r -= alpha_i * res1_i; \
+ dst0_i += alpha_r * res1_i; \
+ dst0_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+ ST_DP(dst2, pc2); \
+ ST_DP(dst3, pc3); \
+}
+
+#define ZGEMM_SCALE_4X2_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+}
+
+#define ZGEMM_SCALE_2X2_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ \
+ LD_DP2(pc1, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+}
+
+#define ZGEMM_SCALE_1X2_MSA \
+{ \
+ dst0 = LD_DP(pc0); \
+ dst1 = LD_DP(pc1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+}
+
+#define ZGEMM_SCALE_4X1_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+}
+
+#define ZGEMM_SCALE_2X1_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+}
+
+#define ZGEMM_SCALE_1X1 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+#define ZGEMM_TRMM_SCALE_4X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r = alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i = alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r = alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i = alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_2X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc2, 2); \
+ ST_DP2_INC(dst2, dst3, pc3, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_1X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res1_r; \
+ dst0_r -= alpha_i * res1_i; \
+ dst0_i = alpha_r * res1_i; \
+ dst0_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+ ST_DP(dst2, pc2); \
+ ST_DP(dst3, pc3); \
+}
+
+#define ZGEMM_TRMM_SCALE_4X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_2X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_1X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+}
+
+#define ZGEMM_TRMM_SCALE_4X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_2X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+}
+
+#define ZGEMM_TRMM_SCALE_1X1 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+ FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
+ FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
+ v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+ v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
+ v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+ v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+
+ alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
+ alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 2); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+ pc2 = pc1 + 2 * ldc;
+ pc3 = pc2 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X4_MSA
+#else
+ ZGEMM_SCALE_4X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X4_MSA
+#else
+ ZGEMM_SCALE_2X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
+#endif
+
+ pa0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X4_MSA
+#else
+ ZGEMM_SCALE_1X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in B
+#endif
+
+ l = k << 3;
+ B = B + l;
+ i = ldc << 3;
+ C = C + i;
+ }
+
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X2_MSA
+#else
+ ZGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X2_MSA
+#else
+ ZGEMM_SCALE_2X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
+#endif
+
+ pa0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X2_MSA
+#else
+ ZGEMM_SCALE_1X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in B
+#endif
+
+ l = k << 2;
+ B = B + l;
+ i = ldc << 2;
+ C = C + i;
+ }
+
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X1_MSA
+#else
+ ZGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X1_MSA
+#else
+ ZGEMM_SCALE_2X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X1
+#else
+ ZGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in B
+#endif
+
+ l = k << 1;
+ B = B + l;
+ i = ldc << 1;
+ C = C + i;
+ }
+ return 0;
+}
diff --git a/kernel/mips/zgemm_ncopy_4_msa.c b/kernel/mips/zgemm_ncopy_4_msa.c
new file mode 100644
index 000000000..3ef46a571
--- /dev/null
+++ b/kernel/mips/zgemm_ncopy_4_msa.c
@@ -0,0 +1,144 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst = dst;
+ lda *= 2;
+
+ for (j = (n >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+ ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+ ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15,
+ pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src4, src5);
+ LD_DP2_INC(psrc3, 2, src8, src9);
+ LD_DP2_INC(psrc4, 2, src12, src13);
+
+ ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src4 = LD_DP(psrc2);
+ src8 = LD_DP(psrc3);
+ src12 = LD_DP(psrc4);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ ST_DP4_INC(src0, src4, src8, src12, pdst, 2);
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+ ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src4, src5);
+
+ ST_DP4_INC(src0, src4, src1, src5, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src4 = LD_DP(psrc2);
+ psrc1 += 2;
+ psrc2 += 2;
+
+ ST_DP2_INC(src0, src4, pdst, 2);
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ ST_DP4_INC(src0, src1, src2, src3, pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ ST_DP2_INC(src0, src1, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ ST_DP(src0, pdst);
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/zgemm_tcopy_4_msa.c b/kernel/mips/zgemm_tcopy_4_msa.c
new file mode 100644
index 000000000..70314cb21
--- /dev/null
+++ b/kernel/mips/zgemm_tcopy_4_msa.c
@@ -0,0 +1,161 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+ FLOAT *pdst0, *pdst1, *pdst2, *pdst3;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst0 = dst;
+ lda *= 2;
+
+ pdst2 = dst + 2 * m * (n & ~3);
+ pdst3 = dst + 2 * m * (n & ~1);
+
+ for (j = (m >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ pdst1 = pdst0;
+ pdst0 += 32;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+ ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+ ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+ pdst1 + 16, 2);
+ pdst1 += m * 8;
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+
+ ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src1 = LD_DP(psrc2);
+ src2 = LD_DP(psrc3);
+ src3 = LD_DP(psrc4);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
+ }
+ }
+
+ if (m & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ pdst1 = pdst0;
+ pdst0 += 16;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+ ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+
+ pdst1 += m * 8;
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+
+ ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src1 = LD_DP(psrc2);
+
+ ST_DP2_INC(src0, src1, pdst3, 2);
+
+ psrc1 += 2;
+ psrc2 += 2;
+ }
+ }
+
+ if (m & 1)
+ {
+ psrc1 = psrc0;
+ pdst1 = pdst0;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ ST_DP4(src0, src1, src2, src3, pdst1, 2);
+
+ pdst1 += m * 8;
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ ST_DP2_INC(src0, src1, pdst2, 2);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ ST_DP(src0, pdst3);
+ }
+ }
+
+ return 0;
+}
diff --git a/kernel/mips/zgemv_n.c b/kernel/mips/zgemv_n.c
new file mode 100644
index 000000000..9bf1f6b42
--- /dev/null
+++ b/kernel/mips/zgemv_n.c
@@ -0,0 +1,147 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT temp_r,temp_i;
+ BLASLONG inc_x2,inc_y2;
+ BLASLONG lda2;
+ BLASLONG i2;
+
+ lda2 = 2*lda;
+
+ ix = 0;
+ a_ptr = a;
+
+ if ( inc_x == 1 && inc_y == 1 )
+ {
+
+ for (j=0; j> 2); j--;) \
+ { \
+ ZLOAD_X4_SCALE() \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_Y4() \
+ ZGEMV_N_4x4() \
+ ZSTORE_Y4() \
+ \
+ k += 2 * 4; \
+ y += inc_y2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_Y2() \
+ ZGEMV_N_2x4() \
+ ZSTORE_Y2() \
+ \
+ k += 2 * 2; \
+ y += inc_y2 * 2; \
+ } \
+ \
+ if (m & 1) \
+ { \
+ temp0_r = tp4r[0]; \
+ temp1_r = tp4r[1]; \
+ temp2_r = tp5r[0]; \
+ temp3_r = tp5r[1]; \
+ \
+ temp0_i = tp4i[0]; \
+ temp1_i = tp4i[1]; \
+ temp2_i = tp5i[0]; \
+ temp3_i = tp5i[1]; \
+ \
+ ZGEMV_N_1x4() \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ \
+ pa0 += 4 * lda2; \
+ pa1 += 4 * lda2; \
+ pa2 += 4 * lda2; \
+ pa3 += 4 * lda2; \
+ \
+ x += 4 * inc_x2; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ ZLOAD_X2_SCALE() \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_Y4() \
+ ZGEMV_N_4x2() \
+ ZSTORE_Y4() \
+ \
+ k += 2 * 4; \
+ y += inc_y2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_Y2() \
+ ZGEMV_N_2x2() \
+ ZSTORE_Y2() \
+ \
+ k += 2 * 2; \
+ y += inc_y2 * 2; \
+ } \
+ \
+ if (m & 1) \
+ { \
+ temp0_r = tp4r[0]; \
+ temp1_r = tp4r[1]; \
+ \
+ temp0_i = tp4i[0]; \
+ temp1_i = tp4i[1]; \
+ \
+ ZGEMV_N_1x2() \
+ \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ \
+ pa0 += 2 * lda2; \
+ pa1 += 2 * lda2; \
+ \
+ x += 2 * inc_x2; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ ZLOAD_X1_SCALE() \
+ \
+ k = 0; \
+ y = y_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_Y4() \
+ ZGEMV_N_4x1() \
+ ZSTORE_Y4() \
+ \
+ k += 2 * 4; \
+ y += inc_y2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_Y2() \
+ ZGEMV_N_2x1() \
+ ZSTORE_Y2() \
+ \
+ k += 2 * 2; \
+ y += inc_y2 * 2; \
+ } \
+ \
+ if (m & 1) \
+ { \
+ ZGEMV_N_1x1() \
+ \
+ k += 2; \
+ y += inc_y2; \
+ } \
+ \
+ pa0 += lda2; \
+ x += inc_x2; \
+ } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
+ BLASLONG inc_y2, FLOAT *buffer)
+{
+ BLASLONG i, j, k;
+ FLOAT *y_org = y;
+ FLOAT *pa0, *pa1, *pa2, *pa3;
+ FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i;
+ FLOAT temp3_i, res0, res1;
+ v2f64 alphar, alphai;
+ v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
+ v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i;
+ v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+ v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+ v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+ v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i;
+
+ lda2 = 2 * lda2;
+ inc_x2 = 2 * inc_x2;
+ inc_y2 = 2 * inc_y2;
+
+ pa0 = A;
+ pa1 = A + lda2;
+ pa2 = A + 2 * lda2;
+ pa3 = A + 3 * lda2;
+
+ alphar = COPY_DOUBLE_TO_VECTOR(alpha_r);
+ alphai = COPY_DOUBLE_TO_VECTOR(alpha_i);
+
+ if ((2 == inc_x2) && (2 == inc_y2))
+ {
+ #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR
+ #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR
+ #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
+ #define ZLOAD_Y4 ZLOAD_Y4_VECTOR
+ #define ZLOAD_Y2 ZLOAD_Y2_VECTOR
+ #define ZSTORE_Y4 ZSTORE_Y4_VECTOR
+ #define ZSTORE_Y2 ZSTORE_Y2_VECTOR
+
+ ZGEMV_N_MSA();
+
+ #undef ZLOAD_X4_SCALE
+ #undef ZLOAD_X2_SCALE
+ #undef ZLOAD_X1_SCALE
+ #undef ZLOAD_Y4
+ #undef ZLOAD_Y2
+ #undef ZSTORE_Y4
+ #undef ZSTORE_Y2
+ }
+ else if (2 == inc_x2)
+ {
+ #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR
+ #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR
+ #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
+ #define ZLOAD_Y4 ZLOAD_Y4_GP
+ #define ZLOAD_Y2 ZLOAD_Y2_GP
+ #define ZSTORE_Y4 ZSTORE_Y4_GP
+ #define ZSTORE_Y2 ZSTORE_Y2_GP
+
+ ZGEMV_N_MSA();
+
+ #undef ZLOAD_X4_SCALE
+ #undef ZLOAD_X2_SCALE
+ #undef ZLOAD_X1_SCALE
+ #undef ZLOAD_Y4
+ #undef ZLOAD_Y2
+ #undef ZSTORE_Y4
+ #undef ZSTORE_Y2
+ }
+ else if (2 == inc_y2)
+ {
+ #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP
+ #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP
+ #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
+ #define ZLOAD_Y4 ZLOAD_Y4_VECTOR
+ #define ZLOAD_Y2 ZLOAD_Y2_VECTOR
+ #define ZSTORE_Y4 ZSTORE_Y4_VECTOR
+ #define ZSTORE_Y2 ZSTORE_Y2_VECTOR
+
+ ZGEMV_N_MSA();
+
+ #undef ZLOAD_X4_SCALE
+ #undef ZLOAD_X2_SCALE
+ #undef ZLOAD_X1_SCALE
+ #undef ZLOAD_Y4
+ #undef ZLOAD_Y2
+ #undef ZSTORE_Y4
+ #undef ZSTORE_Y2
+ }
+ else
+ {
+ #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP
+ #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP
+ #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
+ #define ZLOAD_Y4 ZLOAD_Y4_GP
+ #define ZLOAD_Y2 ZLOAD_Y2_GP
+ #define ZSTORE_Y4 ZSTORE_Y4_GP
+ #define ZSTORE_Y2 ZSTORE_Y2_GP
+
+ ZGEMV_N_MSA();
+
+ #undef ZLOAD_X4_SCALE
+ #undef ZLOAD_X2_SCALE
+ #undef ZLOAD_X1_SCALE
+ #undef ZLOAD_Y4
+ #undef ZLOAD_Y2
+ #undef ZSTORE_Y4
+ #undef ZSTORE_Y2
+ }
+ return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
diff --git a/kernel/mips/zgemv_t.c b/kernel/mips/zgemv_t.c
new file mode 100644
index 000000000..2dfb9d255
--- /dev/null
+++ b/kernel/mips/zgemv_t.c
@@ -0,0 +1,130 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT temp_r,temp_i;
+ BLASLONG inc_x2,inc_y2;
+ BLASLONG lda2;
+ BLASLONG i2;
+
+ lda2 = 2*lda;
+
+ iy = 0;
+ a_ptr = a;
+
+ if ( inc_x == 1 && inc_y == 1 )
+ {
+
+ for (j=0; j> 2); j--;) \
+ { \
+ tp0r = tp1r = tp2r = tp3r = zero; \
+ tp0i = tp1i = tp2i = tp3i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_X4(); \
+ ZGEMV_T_4x4(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_X2(); \
+ ZGEMV_T_2x4(); \
+ \
+ k += 2 * 2; \
+ x += inc_x2 * 2; \
+ } \
+ \
+ temp0r = tp0r[0] + tp0r[1]; \
+ temp1r = tp1r[0] + tp1r[1]; \
+ temp2r = tp2r[0] + tp2r[1]; \
+ temp3r = tp3r[0] + tp3r[1]; \
+ temp0i = tp0i[0] + tp0i[1]; \
+ temp1i = tp1i[0] + tp1i[1]; \
+ temp2i = tp2i[0] + tp2i[1]; \
+ temp3i = tp3i[0] + tp3i[1]; \
+ \
+ if (m & 1) \
+ { \
+ ZGEMV_T_1x4(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ ZSCALE_STORE_Y4_GP(); \
+ \
+ pa0 += 4 * lda2; \
+ pa1 += 4 * lda2; \
+ pa2 += 4 * lda2; \
+ pa3 += 4 * lda2; \
+ y += 4 * inc_y2; \
+ } \
+ \
+ if (n & 2) \
+ { \
+ tp0r = tp1r = zero; \
+ tp0i = tp1i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_X4(); \
+ ZGEMV_T_4x2(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_X2(); \
+ ZGEMV_T_2x2(); \
+ \
+ k += 2 * 2; \
+ x += inc_x2 * 2; \
+ } \
+ \
+ temp0r = tp0r[0] + tp0r[1]; \
+ temp1r = tp1r[0] + tp1r[1]; \
+ temp0i = tp0i[0] + tp0i[1]; \
+ temp1i = tp1i[0] + tp1i[1]; \
+ \
+ if (m & 1) \
+ { \
+ ZGEMV_T_1x2(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ ZSCALE_STORE_Y2_GP(); \
+ \
+ pa0 += 2 * lda2; \
+ pa1 += 2 * lda2; \
+ y += 2 * inc_y2; \
+ } \
+ \
+ if (n & 1) \
+ { \
+ tp0r = zero; \
+ tp0i = zero; \
+ \
+ k = 0; \
+ x = srcx_org; \
+ \
+ for (i = (m >> 2); i--;) \
+ { \
+ ZLOAD_X4(); \
+ ZGEMV_T_4x1(); \
+ \
+ k += 2 * 4; \
+ x += inc_x2 * 4; \
+ } \
+ \
+ if (m & 2) \
+ { \
+ ZLOAD_X2(); \
+ ZGEMV_T_2x1(); \
+ \
+ k += 2 * 2; \
+ x += inc_x2 * 2; \
+ } \
+ \
+ temp0r = tp0r[0] + tp0r[1]; \
+ temp0i = tp0i[0] + tp0i[1]; \
+ \
+ if (m & 1) \
+ { \
+ ZGEMV_T_1x1(); \
+ \
+ k += 2; \
+ x += inc_x2; \
+ } \
+ \
+ ZSCALE_STORE_Y1_GP(); \
+ \
+ pa0 += lda2; \
+ y += inc_y2; \
+ } \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
+ FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+ BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i, j, k;
+ BLASLONG inc_x2, inc_y2, lda2;
+ FLOAT *pa0, *pa1, *pa2, *pa3;
+ FLOAT *srcx_org = x;
+ FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
+ FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
+ v2f64 zero = {0};
+ v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
+ v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+ v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+ v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+ v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
+
+ lda2 = 2 * lda;
+
+ pa0 = A;
+ pa1 = A + lda2;
+ pa2 = A + 2 * lda2;
+ pa3 = A + 3 * lda2;
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ if (2 == inc_x2)
+ {
+ #define ZLOAD_X4 ZLOAD_X4_VECTOR
+ #define ZLOAD_X2 ZLOAD_X2_VECTOR
+
+ ZGEMV_T_MSA();
+
+ #undef ZLOAD_X4
+ #undef ZLOAD_X2
+ }
+ else
+ {
+ #define ZLOAD_X4 ZLOAD_X4_GP
+ #define ZLOAD_X2 ZLOAD_X2_GP
+
+ ZGEMV_T_MSA();
+
+ #undef ZLOAD_X4
+ #undef ZLOAD_X2
+ }
+
+ return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c
new file mode 100644
index 000000000..85be39cd1
--- /dev/null
+++ b/kernel/mips/znrm2.c
@@ -0,0 +1,97 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ FLOAT scale = 0.0;
+ FLOAT ssq = 1.0;
+ BLASLONG inc_x2;
+ FLOAT temp;
+
+ if (n <= 0 || inc_x <= 0) return(0.0);
+
+ inc_x2 = 2 * inc_x;
+
+ n *= inc_x2;
+ while(i < n)
+ {
+
+ if ( x[i] != 0.0 )
+ {
+ temp = ABS( x[i] );
+ if ( scale < temp )
+ {
+ ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+ scale = temp ;
+ }
+ else
+ {
+ ssq += ( temp / scale ) * ( temp / scale );
+ }
+
+ }
+
+ if ( x[i+1] != 0.0 )
+ {
+ temp = ABS( x[i+1] );
+ if ( scale < temp )
+ {
+ ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+ scale = temp ;
+ }
+ else
+ {
+ ssq += ( temp / scale ) * ( temp / scale );
+ }
+
+ }
+
+
+ i += inc_x2;
+ }
+ scale = scale * sqrt( ssq );
+ return(scale);
+
+}
+
+
diff --git a/kernel/mips/zomatcopy_cn.c b/kernel/mips/zomatcopy_cn.c
new file mode 100644
index 000000000..bf6d3c70d
--- /dev/null
+++ b/kernel/mips/zomatcopy_cn.c
@@ -0,0 +1,62 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+ BLASLONG i,j,ia;
+ FLOAT *aptr,*bptr;
+
+ if ( rows <= 0 ) return(0);
+ if ( cols <= 0 ) return(0);
+
+ aptr = a;
+ bptr = b;
+
+ lda *= 2;
+ ldb *= 2;
+
+ for ( i=0; i
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT temp[2];
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+
+ if ( n < 0 ) return(0);
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/mips64/KERNEL.I6400 b/kernel/mips64/KERNEL.I6400
new file mode 100644
index 000000000..abf44814a
--- /dev/null
+++ b/kernel/mips64/KERNEL.I6400
@@ -0,0 +1 @@
+include $(KERNELDIR)/../mips/KERNEL.P5600
diff --git a/kernel/mips64/KERNEL.P6600 b/kernel/mips64/KERNEL.P6600
new file mode 100644
index 000000000..abf44814a
--- /dev/null
+++ b/kernel/mips64/KERNEL.P6600
@@ -0,0 +1 @@
+include $(KERNELDIR)/../mips/KERNEL.P5600
diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S
index 32694a99d..5d9728a48 100644
--- a/kernel/mips64/axpy.S
+++ b/kernel/mips64/axpy.S
@@ -225,7 +225,9 @@
.align 3
.L20:
+ beqz INCY, .L27
dsra I, N, 3
+
move YY, Y
blez I, .L25
@@ -405,5 +407,19 @@
j $31
NOP
+ .align 3
+
+.L27:
+ LD b1, 0 * SIZE(Y)
+
+.L28:
+ daddiu N, N, -1
+ LD a1, 0 * SIZE(X)
+ daddu X, X, INCX
+ bgtz N, .L28
+ MADD b1, b1, ALPHA, a1
+
+ j .L999
+ ST b1, 0 * SIZE(Y)
EPILOGUE
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index b37a4213b..b9f44db91 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -10,9 +10,9 @@ ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
-SGEMMITCOPY = ../generic/gemm_tcopy_16.c
+SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+SGEMMOTCOPY = sgemm_tcopy_8_power8.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
@@ -20,17 +20,17 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY = ../generic/gemm_tcopy_16.c
-DGEMMONCOPY = gemm_ncopy_4.S
-DGEMMOTCOPY = gemm_tcopy_4.S
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMITCOPY = dgemm_tcopy_16_power8.S
+DGEMMONCOPY = dgemm_ncopy_4_power8.S
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
@@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+ZGEMMITCOPY = zgemm_tcopy_8_power8.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o
@@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
@@ -125,10 +125,10 @@ DDOTKERNEL = ddot.c
#CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = zdot.c
#
-#SNRM2KERNEL = ../arm/nrm2.c
-#DNRM2KERNEL = ../arm/nrm2.c
-#CNRM2KERNEL = ../arm/znrm2.c
-#ZNRM2KERNEL = ../arm/znrm2.c
+SNRM2KERNEL = ../arm/nrm2.c
+DNRM2KERNEL = ../arm/nrm2.c
+CNRM2KERNEL = ../arm/znrm2.c
+ZNRM2KERNEL = ../arm/znrm2.c
#
SROTKERNEL = srot.c
DROTKERNEL = drot.c
@@ -137,7 +137,7 @@ DROTKERNEL = drot.c
#
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
-#CSCALKERNEL = ../arm/zscal.c
+CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
#
SSWAPKERNEL = sswap.c
diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..b1a7d2b27
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_8_power8.S
@@ -0,0 +1,206 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o4 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTUS2 r27
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "cgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, ZBASE_SHIFT
+ slwi M8, M, 3 + ZBASE_SHIFT
+
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, ZBASE_SHIFT
+ slwi B2, B2, ZBASE_SHIFT
+ slwi B1, B1, ZBASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o4, 4
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "cgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..9418908b7
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2
+ ble CCOPYT_L2_BEGIN
+
+
+CCOPYT_L4_BEGIN:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+ mr B8, B
+ addi B, B, 64*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L4x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L4x8_LOOP:
+
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ dcbtst BO, M8
+ dcbtst BO, PREB
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ ble CCOPYT_L4x4_BEGIN
+
+
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L4x8_LOOP
+
+CCOPYT_L4x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L4x2_BEGIN
+
+ mr BO, B4
+
+ COPY_4x4
+
+
+ addi B4, B4, 32*SIZE
+
+CCOPYT_L4x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+
+ addi B2, B2, 16*SIZE
+
+CCOPYT_L4x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+
+ addi B1, B1, 8*SIZE
+
+CCOPYT_L4_END:
+
+ addic. I, I, -1
+ bgt CCOPYT_L4_BEGIN
+
+
+
+CCOPYT_L2_BEGIN:
+
+ andi. T1, M, 2
+ ble CCOPYT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 32*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L2x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L2x8_LOOP:
+
+ COPY_2x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L2x8_LOOP
+
+CCOPYT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+
+ addi B4, B4, 16*SIZE
+
+CCOPYT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+
+ addi B2, B2, 8*SIZE
+
+CCOPYT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+
+ addi B1, B1, 4*SIZE
+
+CCOPYT_L2_END:
+
+
+CCOPYT_L1_BEGIN:
+
+ andi. T1, M, 1
+ ble L999
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble CCOPYT_L1x4_BEGIN
+
+ mr BO, B8
+
+CCOPYT_L1x8_LOOP:
+
+ COPY_1x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt CCOPYT_L1x8_LOOP
+
+CCOPYT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble CCOPYT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+
+ addi B4, B4, 8*SIZE
+
+CCOPYT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble CCOPYT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+
+ addi B2, B2, 4*SIZE
+
+CCOPYT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble CCOPYT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+
+ addi B1, B1, 2*SIZE
+
+CCOPYT_L1_END:
+
diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..03fda2766
--- /dev/null
+++ b/kernel/power/cgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,385 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+
+ lxvw4x vs36, o0, A1
+ lxvw4x vs37, o16, A1
+ lxvw4x vs38, o32, A1
+ lxvw4x vs39, o48, A1
+
+ addi A0, A0, 64
+ addi A1, A1, 64
+
+ lxvw4x vs40, o0, A2
+ lxvw4x vs41, o16, A2
+ lxvw4x vs42, o32, A2
+ lxvw4x vs43, o48, A2
+
+ lxvw4x vs44, o0, A3
+ lxvw4x vs45, o16, A3
+ lxvw4x vs46, o32, A3
+ lxvw4x vs47, o48, A3
+
+ mr T1, BO
+ addi A2, A2, 64
+ addi A3, A3, 64
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs40, o0, T1
+ stxvw4x vs41, o16, T1
+ stxvw4x vs42, o32, T1
+ stxvw4x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs44, o0, T1
+ stxvw4x vs45, o16, T1
+ stxvw4x vs46, o32, T1
+ stxvw4x vs47, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+ addi A1, A1, 32
+
+ lxvw4x vs36, o0, A2
+ lxvw4x vs37, o16, A2
+ addi A2, A2, 32
+
+ lxvw4x vs38, o0, A3
+ lxvw4x vs39, o16, A3
+ addi A3, A3, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ lxvw4x vs33, o0, A1
+ addi A1, A1, 16
+
+ lxvw4x vs34, o0, A2
+ addi A2, A2, 16
+
+ lxvw4x vs35, o0, A3
+ addi A3, A3, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+ addi A1, A1, 8
+
+ lxsspx vs36, o0, A2
+ lxsspx vs37, o4, A2
+ addi A2, A2, 8
+
+ lxsspx vs38, o0, A3
+ lxsspx vs39, o4, A3
+ addi A3, A3, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs36, o0, T1
+ stxsspx vs37, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs38, o0, T1
+ stxsspx vs39, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvw4x vs36, o0, A1
+ lxvw4x vs37, o16, A1
+ lxvw4x vs38, o32, A1
+ lxvw4x vs39, o48, A1
+ addi A1, A1, 64
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvw4x vs36, o0, T1
+ stxvw4x vs37, o16, T1
+ stxvw4x vs38, o32, T1
+ stxvw4x vs39, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ lxvw4x vs34, o0, A1
+ lxvw4x vs35, o16, A1
+ addi A1, A1, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ lxvw4x vs33, o0, A1
+ addi A1, A1, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ lxsspx vs34, o0, A1
+ lxsspx vs35, o4, A1
+ addi A1, A1, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+ addi T1, T1, 8
+
+ stxsspx vs34, o0, T1
+ stxsspx vs35, o4, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ lxvw4x vs34, o32, A0
+ lxvw4x vs35, o48, A0
+ addi A0, A0, 64
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+ stxvw4x vs34, o32, T1
+ stxvw4x vs35, o48, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvw4x vs32, o0, A0
+ lxvw4x vs33, o16, A0
+ addi A0, A0, 32
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+ stxvw4x vs33, o16, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvw4x vs32, o0, A0
+ addi A0, A0, 16
+
+ mr T1, BO
+
+ stxvw4x vs32, o0, T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxsspx vs32, o0, A0
+ lxsspx vs33, o4, A0
+ addi A0, A0, 8
+
+ mr T1, BO
+
+ stxsspx vs32, o0, T1
+ stxsspx vs33, o4, T1
+
+.endm
+
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
index c67f31160..8af7fe389 100644
--- a/kernel/power/dgemm_kernel_16x4_power8.S
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
-* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
@@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0
+#define T4 r12
+#define T3 r11
+
+#define o40 r12
+#define o56 r11
+
+#define o112 r14
#define o8 r15
#define o24 r16
-#define ALPHA r17
+#define o64 r17
#define L r18
#define T1 r19
-#define KK r20
-#define BB r21
+#define o80 r20
+#define o96 r21
#define I r22
#define J r23
#define AO r24
@@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
+ std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
+ stw r14, 212(SP)
#endif
stfd f1, ALPHA_SP
@@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble .L999_H1
#ifdef __64BIT__
- addi ALPHA, SP, 296
+ addi T1, SP, 296
#else
- addi ALPHA, SP, 224
+ addi T1, SP, 224
#endif
- li PRE, 256
+ li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
+ li o64, 64
+ li o80, 80
+ li o96, 96
+ li o112, 112
- lxvdsx alpha_r, 0, ALPHA
+ lxvdsx alpha_r, 0, T1
#include "dgemm_logic_16x4_power8.S"
@@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
+ ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
+ lwz r14, 212(SP)
#endif
addi SP, SP, STACKSIZE
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
index 49c438f61..cacfab1f6 100644
--- a/kernel/power/dgemm_logic_16x4_power8.S
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -33,195 +33,340 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* LAPACK-TEST : OK
**************************************************************************************/
+#define MY_ALIGN .align 3
srawi. J, N, 2
- ble .LDGEMM_L4_END
+ ble LDGEMM_L4_END
-.LDGEMM_L4_BEGIN:
+LDGEMM_L4_BEGIN:
- mr CO, C
+ li T1, 128
+ li T2, 256
mr AO, A
- slwi T1, LDC , 2
- add C, C, T1
+
+ mr CO, C
+ slwi T3, LDC , 2
+ add C, C, T3
+
+ dcbt A, T1
+ dcbt A, T2
+
srawi. I, M, 4
- ble .LDGEMM_L4x16_END
+ ble LDGEMM_L4x16_END
+
+ MY_ALIGN
+LDGEMM_L4x16_BEGIN_FIRST:
-.LDGEMM_L4x16_BEGIN:
+ li L, -128
+ mr T1, CO
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
+
+ and T1, T1, L
+ and T2, T2, L
+ and T3, T3, L
+ and T4, T4, L
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
mr BO, B
- srawi. L, K, 3
- ble .LDGEMM_L4x16_SUB0
+ srawi. L, K, 2
+
+ addi T1, T1, 128
+ addi T2, T2, 128
+ addi T3, T3, 128
+ addi T4, T4, 128
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+ ble LDGEMM_L4x16_SUB0_FIRST
cmpwi cr0, L, 1
- ble .LDGEMM_L4x16_SUB4
+ ble LDGEMM_L4x16_SUB4_FIRST
-.LDGEMM_L4x16_LOOP_START:
+ MY_ALIGN
+LDGEMM_L4x16_LOOP_START_FIRST:
- dcbt AO, PRE
+ li T2, 512
+ li o40, 40
+ li o56, 56
+
+ dcbt AO, PRE
+ dcbt BO, T2
LOAD4x16_1
- dcbt AO, PRE
+ dcbt AO, PRE
KERNEL4x16_I1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ dcbt AO, PRE
+ addic. L, L, -2
+ KERNEL4x16_L2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ dcbt BO, T2
+ KERNEL4x16_L2
- addic. L, L, -2
- ble .LDGEMM_L4x16_LOOP_END
+ ble LDGEMM_L4x16_LOOP_END_FIRST
+ mtctr L
- .align 5
+ MY_ALIGN
-.LDGEMM_L4x16_LOOP:
+LDGEMM_L4x16_LOOP_FIRST:
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ KERNEL4x16_L2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ dcbt BO, T2
+ KERNEL4x16_L2
- addic. L, L, -1
- bgt .LDGEMM_L4x16_LOOP
+ bdnz LDGEMM_L4x16_LOOP_FIRST
-.LDGEMM_L4x16_LOOP_END:
+ MY_ALIGN
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
+LDGEMM_L4x16_LOOP_END_FIRST:
+
+ KERNEL4x16_L1
+ KERNEL4x16_L2
- dcbt AO, PRE
- KERNEL4x16_1
- dcbt AO, PRE
- KERNEL4x16_2
- dcbt AO, PRE
KERNEL4x16_1
KERNEL4x16_E2
- b .LDGEMM_L4x16_SUB1
+ b LDGEMM_L4x16_SUB1_FIRST
-.LDGEMM_L4x16_SUB4:
+LDGEMM_L4x16_SUB4_FIRST:
- dcbt AO, PRE
KERNEL4x16_SUBI1
- dcbt AO, PRE
KERNEL4x16_SUB1
- dcbt AO, PRE
KERNEL4x16_SUB1
- dcbt AO, PRE
KERNEL4x16_SUB1
+ b LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB0_FIRST:
+
+ andi. L, K, 3
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble LDGEMM_L4x16_SAVE_FIRST
+ b LDGEMM_L4x16_SUB2_FIRST
+
+LDGEMM_L4x16_SUB1_FIRST:
+
+ andi. L, K, 3
+ ble LDGEMM_L4x16_SAVE_FIRST
+
+LDGEMM_L4x16_SUB2_FIRST:
+
KERNEL4x16_SUB1
- KERNEL4x16_SUB1
- KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt LDGEMM_L4x16_SUB2_FIRST
+
+ MY_ALIGN
+LDGEMM_L4x16_SAVE_FIRST:
+
+ SAVE4x16
+
+ addic. I, I, -1
+ ble LDGEMM_L4x16_END
+
+LDGEMM_L4x16_END_FIRST:
+
+ MY_ALIGN
+
+LDGEMM_L4x16_BEGIN:
+
+ li L, -128
+
+ mr T1, CO
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
+
+ and T1, T1, L
+ and T2, T2, L
+ and T3, T3, L
+ and T4, T4, L
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+ mr BO, B
+ srawi. L, K, 1
+
+ addi T1, T1, 128
+ addi T2, T2, 128
+ addi T3, T3, 128
+ addi T4, T4, 128
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+ ble- LDGEMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble- LDGEMM_L4x16_SUB4
+
+ MY_ALIGN
+
+LDGEMM_L4x16_LOOP_START:
+
+ li o40, 40
+ li o56, 56
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ addic. L, L, -2
+ KERNEL4x16_L2
+
+ ble- LDGEMM_L4x16_LOOP_END
+ mtctr L
+
+ MY_ALIGN
+
+LDGEMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_L1
+ dcbt AO, PRE
+ KERNEL4x16_L2
+
+ bdnz+ LDGEMM_L4x16_LOOP
+
+
+ MY_ALIGN
+
+LDGEMM_L4x16_LOOP_END:
+
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b LDGEMM_L4x16_SUB1
+
+ MY_ALIGN
+
+LDGEMM_L4x16_SUB4:
+
+ KERNEL4x16_SUBI1
KERNEL4x16_SUB1
- b .LDGEMM_L4x16_SUB1
+ b LDGEMM_L4x16_SUB1
-.LDGEMM_L4x16_SUB0:
+ MY_ALIGN
- andi. L, K, 7
+LDGEMM_L4x16_SUB0:
+
+ andi. L, K, 1
KERNEL4x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x16_SAVE
- b .LDGEMM_L4x16_SUB2
+ ble LDGEMM_L4x16_SAVE
+ b LDGEMM_L4x16_SUB2
-.LDGEMM_L4x16_SUB1:
+ MY_ALIGN
- andi. L, K, 7
- ble .LDGEMM_L4x16_SAVE
+LDGEMM_L4x16_SUB1:
-.LDGEMM_L4x16_SUB2:
+ andi. L, K, 1
+ ble LDGEMM_L4x16_SAVE
+
+ MY_ALIGN
+
+LDGEMM_L4x16_SUB2:
KERNEL4x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x16_SUB2
+ bgt LDGEMM_L4x16_SUB2
+
+ MY_ALIGN
-.LDGEMM_L4x16_SAVE:
+LDGEMM_L4x16_SAVE:
SAVE4x16
addic. I, I, -1
- bgt .LDGEMM_L4x16_BEGIN
+ bgt+ LDGEMM_L4x16_BEGIN
-.LDGEMM_L4x16_END:
+LDGEMM_L4x16_END:
-.LDGEMM_L4x8_BEGIN:
+LDGEMM_L4x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L4x1_END
+ ble LDGEMM_L4x1_END
andi. T1, M, 8
- ble .LDGEMM_L4x8_END
+ ble LDGEMM_L4x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x8_SUB0
+ ble LDGEMM_L4x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x8_SUB4
+ ble LDGEMM_L4x8_SUB4
-.LDGEMM_L4x8_LOOP_START:
+LDGEMM_L4x8_LOOP_START:
+ dcbt AO, PRE
LOAD4x8_1
KERNEL4x8_I1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -2
- ble .LDGEMM_L4x8_LOOP_END
+ ble LDGEMM_L4x8_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L4x8_LOOP:
+LDGEMM_L4x8_LOOP:
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
+ dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -1
- bgt .LDGEMM_L4x8_LOOP
+ bgt LDGEMM_L4x8_LOOP
-.LDGEMM_L4x8_LOOP_END:
+LDGEMM_L4x8_LOOP_END:
KERNEL4x8_1
KERNEL4x8_2
@@ -233,9 +378,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x8_1
KERNEL4x8_E2
- b .LDGEMM_L4x8_SUB1
+ b LDGEMM_L4x8_SUB1
-.LDGEMM_L4x8_SUB4:
+LDGEMM_L4x8_SUB4:
KERNEL4x8_SUBI1
KERNEL4x8_SUB1
@@ -247,81 +392,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x8_SUB1
KERNEL4x8_SUB1
- b .LDGEMM_L4x8_SUB1
+ b LDGEMM_L4x8_SUB1
-.LDGEMM_L4x8_SUB0:
+LDGEMM_L4x8_SUB0:
andi. L, K, 7
KERNEL4x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x8_SAVE
- b .LDGEMM_L4x8_SUB2
+ ble LDGEMM_L4x8_SAVE
+ b LDGEMM_L4x8_SUB2
-.LDGEMM_L4x8_SUB1:
+LDGEMM_L4x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x8_SAVE
+ ble LDGEMM_L4x8_SAVE
-.LDGEMM_L4x8_SUB2:
+LDGEMM_L4x8_SUB2:
KERNEL4x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x8_SUB2
+ bgt LDGEMM_L4x8_SUB2
-.LDGEMM_L4x8_SAVE:
+LDGEMM_L4x8_SAVE:
SAVE4x8
-.LDGEMM_L4x8_END:
+LDGEMM_L4x8_END:
-.LDGEMM_L4x4_BEGIN:
+LDGEMM_L4x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L4x4_END
+ ble LDGEMM_L4x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x4_SUB0
+ ble LDGEMM_L4x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x4_SUB4
+ ble LDGEMM_L4x4_SUB4
-.LDGEMM_L4x4_LOOP_START:
+LDGEMM_L4x4_LOOP_START:
+ dcbt AO, PRE
LOAD4x4_1
KERNEL4x4_I1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -2
- ble .LDGEMM_L4x4_LOOP_END
+ ble LDGEMM_L4x4_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L4x4_LOOP:
+LDGEMM_L4x4_LOOP:
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
+ dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -1
- bgt .LDGEMM_L4x4_LOOP
+ bgt LDGEMM_L4x4_LOOP
-.LDGEMM_L4x4_LOOP_END:
+LDGEMM_L4x4_LOOP_END:
KERNEL4x4_1
KERNEL4x4_2
@@ -333,9 +483,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x4_1
KERNEL4x4_E2
- b .LDGEMM_L4x4_SUB1
+ b LDGEMM_L4x4_SUB1
-.LDGEMM_L4x4_SUB4:
+LDGEMM_L4x4_SUB4:
KERNEL4x4_SUBI1
KERNEL4x4_SUB1
@@ -347,48 +497,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x4_SUB1
KERNEL4x4_SUB1
- b .LDGEMM_L4x4_SUB1
+ b LDGEMM_L4x4_SUB1
-.LDGEMM_L4x4_SUB0:
+LDGEMM_L4x4_SUB0:
andi. L, K, 7
KERNEL4x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x4_SAVE
- b .LDGEMM_L4x4_SUB2
+ ble LDGEMM_L4x4_SAVE
+ b LDGEMM_L4x4_SUB2
-.LDGEMM_L4x4_SUB1:
+LDGEMM_L4x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x4_SAVE
+ ble LDGEMM_L4x4_SAVE
-.LDGEMM_L4x4_SUB2:
+LDGEMM_L4x4_SUB2:
KERNEL4x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x4_SUB2
+ bgt LDGEMM_L4x4_SUB2
-.LDGEMM_L4x4_SAVE:
+LDGEMM_L4x4_SAVE:
SAVE4x4
-.LDGEMM_L4x4_END:
+LDGEMM_L4x4_END:
-.LDGEMM_L4x2_BEGIN:
+LDGEMM_L4x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L4x2_END
+ ble LDGEMM_L4x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x2_SUB0
+ ble LDGEMM_L4x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x2_SUB4
+ ble LDGEMM_L4x2_SUB4
-.LDGEMM_L4x2_LOOP_START:
+LDGEMM_L4x2_LOOP_START:
LOAD4x2_1
KERNEL4x2_I1
@@ -402,11 +552,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_2
addic. L, L, -2
- ble .LDGEMM_L4x2_LOOP_END
+ ble LDGEMM_L4x2_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L4x2_LOOP:
+LDGEMM_L4x2_LOOP:
KERNEL4x2_1
KERNEL4x2_2
@@ -419,9 +569,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_2
addic. L, L, -1
- bgt .LDGEMM_L4x2_LOOP
+ bgt LDGEMM_L4x2_LOOP
-.LDGEMM_L4x2_LOOP_END:
+LDGEMM_L4x2_LOOP_END:
KERNEL4x2_1
KERNEL4x2_2
@@ -433,9 +583,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_1
KERNEL4x2_E2
- b .LDGEMM_L4x2_SUB1
+ b LDGEMM_L4x2_SUB1
-.LDGEMM_L4x2_SUB4:
+LDGEMM_L4x2_SUB4:
KERNEL4x2_SUBI1
KERNEL4x2_SUB1
@@ -447,48 +597,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x2_SUB1
KERNEL4x2_SUB1
- b .LDGEMM_L4x2_SUB1
+ b LDGEMM_L4x2_SUB1
-.LDGEMM_L4x2_SUB0:
+LDGEMM_L4x2_SUB0:
andi. L, K, 7
KERNEL4x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x2_SAVE
- b .LDGEMM_L4x2_SUB2
+ ble LDGEMM_L4x2_SAVE
+ b LDGEMM_L4x2_SUB2
-.LDGEMM_L4x2_SUB1:
+LDGEMM_L4x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x2_SAVE
+ ble LDGEMM_L4x2_SAVE
-.LDGEMM_L4x2_SUB2:
+LDGEMM_L4x2_SUB2:
KERNEL4x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x2_SUB2
+ bgt LDGEMM_L4x2_SUB2
-.LDGEMM_L4x2_SAVE:
+LDGEMM_L4x2_SAVE:
SAVE4x2
-.LDGEMM_L4x2_END:
+LDGEMM_L4x2_END:
-.LDGEMM_L4x1_BEGIN:
+LDGEMM_L4x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L4x1_END
+ ble LDGEMM_L4x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L4x1_SUB0
+ ble LDGEMM_L4x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L4x1_SUB4
+ ble LDGEMM_L4x1_SUB4
-.LDGEMM_L4x1_LOOP_START:
+LDGEMM_L4x1_LOOP_START:
LOAD4x1_1
KERNEL4x1_I1
@@ -502,11 +652,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_2
addic. L, L, -2
- ble .LDGEMM_L4x1_LOOP_END
+ ble LDGEMM_L4x1_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L4x1_LOOP:
+LDGEMM_L4x1_LOOP:
KERNEL4x1_1
KERNEL4x1_2
@@ -519,9 +669,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_2
addic. L, L, -1
- bgt .LDGEMM_L4x1_LOOP
+ bgt LDGEMM_L4x1_LOOP
-.LDGEMM_L4x1_LOOP_END:
+LDGEMM_L4x1_LOOP_END:
KERNEL4x1_1
KERNEL4x1_2
@@ -533,9 +683,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_1
KERNEL4x1_E2
- b .LDGEMM_L4x1_SUB1
+ b LDGEMM_L4x1_SUB1
-.LDGEMM_L4x1_SUB4:
+LDGEMM_L4x1_SUB4:
KERNEL4x1_SUBI1
KERNEL4x1_SUB1
@@ -547,74 +697,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL4x1_SUB1
KERNEL4x1_SUB1
- b .LDGEMM_L4x1_SUB1
+ b LDGEMM_L4x1_SUB1
-.LDGEMM_L4x1_SUB0:
+LDGEMM_L4x1_SUB0:
andi. L, K, 7
KERNEL4x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L4x1_SAVE
- b .LDGEMM_L4x1_SUB2
+ ble LDGEMM_L4x1_SAVE
+ b LDGEMM_L4x1_SUB2
-.LDGEMM_L4x1_SUB1:
+LDGEMM_L4x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L4x1_SAVE
+ ble LDGEMM_L4x1_SAVE
-.LDGEMM_L4x1_SUB2:
+LDGEMM_L4x1_SUB2:
KERNEL4x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L4x1_SUB2
+ bgt LDGEMM_L4x1_SUB2
-.LDGEMM_L4x1_SAVE:
+LDGEMM_L4x1_SAVE:
SAVE4x1
-.LDGEMM_L4x1_END:
+LDGEMM_L4x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
- bgt .LDGEMM_L4_BEGIN
+ bgt LDGEMM_L4_BEGIN
andi. T2, N, 3
ble .L999
-.LDGEMM_L4_END:
+LDGEMM_L4_END:
- b .LDGEMM_L2_BEGIN
+ b LDGEMM_L2_BEGIN
.L999_H1:
b .L999
-.LDGEMM_L2_BEGIN:
+LDGEMM_L2_BEGIN:
andi. T1, N, 2
- ble .LDGEMM_L2_END
+ ble LDGEMM_L2_END
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 4
- ble .LDGEMM_L2x16_END
+ ble LDGEMM_L2x16_END
-.LDGEMM_L2x16_BEGIN:
+LDGEMM_L2x16_BEGIN:
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x16_SUB0
+ ble LDGEMM_L2x16_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x16_SUB4
+ ble LDGEMM_L2x16_SUB4
-.LDGEMM_L2x16_LOOP_START:
+LDGEMM_L2x16_LOOP_START:
dcbt AO, PRE
LOAD2x16_1
@@ -637,11 +787,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_2
addic. L, L, -2
- ble .LDGEMM_L2x16_LOOP_END
+ ble LDGEMM_L2x16_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L2x16_LOOP:
+LDGEMM_L2x16_LOOP:
dcbt AO, PRE
KERNEL2x16_1
@@ -662,9 +812,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_2
addic. L, L, -1
- bgt .LDGEMM_L2x16_LOOP
+ bgt LDGEMM_L2x16_LOOP
-.LDGEMM_L2x16_LOOP_END:
+LDGEMM_L2x16_LOOP_END:
dcbt AO, PRE
KERNEL2x16_1
@@ -683,9 +833,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_1
KERNEL2x16_E2
- b .LDGEMM_L2x16_SUB1
+ b LDGEMM_L2x16_SUB1
-.LDGEMM_L2x16_SUB4:
+LDGEMM_L2x16_SUB4:
dcbt AO, PRE
KERNEL2x16_SUBI1
@@ -701,86 +851,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x16_SUB1
KERNEL2x16_SUB1
- b .LDGEMM_L2x16_SUB1
+ b LDGEMM_L2x16_SUB1
-.LDGEMM_L2x16_SUB0:
+LDGEMM_L2x16_SUB0:
andi. L, K, 7
KERNEL2x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x16_SAVE
- b .LDGEMM_L2x16_SUB2
+ ble LDGEMM_L2x16_SAVE
+ b LDGEMM_L2x16_SUB2
-.LDGEMM_L2x16_SUB1:
+LDGEMM_L2x16_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x16_SAVE
+ ble LDGEMM_L2x16_SAVE
-.LDGEMM_L2x16_SUB2:
+LDGEMM_L2x16_SUB2:
KERNEL2x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x16_SUB2
+ bgt LDGEMM_L2x16_SUB2
-.LDGEMM_L2x16_SAVE:
+LDGEMM_L2x16_SAVE:
SAVE2x16
addic. I, I, -1
- bgt .LDGEMM_L2x16_BEGIN
+ bgt LDGEMM_L2x16_BEGIN
-.LDGEMM_L2x16_END:
+LDGEMM_L2x16_END:
-.LDGEMM_L2x8_BEGIN:
+LDGEMM_L2x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L2x1_END
+ ble LDGEMM_L2x1_END
andi. T1, M, 8
- ble .LDGEMM_L2x8_END
+ ble LDGEMM_L2x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x8_SUB0
+ ble LDGEMM_L2x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x8_SUB4
+ ble LDGEMM_L2x8_SUB4
-.LDGEMM_L2x8_LOOP_START:
+LDGEMM_L2x8_LOOP_START:
+ dcbt AO, PRE
LOAD2x8_1
KERNEL2x8_I1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -2
- ble .LDGEMM_L2x8_LOOP_END
+ ble LDGEMM_L2x8_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L2x8_LOOP:
+LDGEMM_L2x8_LOOP:
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
+ dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -1
- bgt .LDGEMM_L2x8_LOOP
+ bgt LDGEMM_L2x8_LOOP
-.LDGEMM_L2x8_LOOP_END:
+LDGEMM_L2x8_LOOP_END:
KERNEL2x8_1
KERNEL2x8_2
@@ -792,9 +951,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x8_1
KERNEL2x8_E2
- b .LDGEMM_L2x8_SUB1
+ b LDGEMM_L2x8_SUB1
-.LDGEMM_L2x8_SUB4:
+LDGEMM_L2x8_SUB4:
KERNEL2x8_SUBI1
KERNEL2x8_SUB1
@@ -806,48 +965,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x8_SUB1
KERNEL2x8_SUB1
- b .LDGEMM_L2x8_SUB1
+ b LDGEMM_L2x8_SUB1
-.LDGEMM_L2x8_SUB0:
+LDGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x8_SAVE
- b .LDGEMM_L2x8_SUB2
+ ble LDGEMM_L2x8_SAVE
+ b LDGEMM_L2x8_SUB2
-.LDGEMM_L2x8_SUB1:
+LDGEMM_L2x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x8_SAVE
+ ble LDGEMM_L2x8_SAVE
-.LDGEMM_L2x8_SUB2:
+LDGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x8_SUB2
+ bgt LDGEMM_L2x8_SUB2
-.LDGEMM_L2x8_SAVE:
+LDGEMM_L2x8_SAVE:
SAVE2x8
-.LDGEMM_L2x8_END:
+LDGEMM_L2x8_END:
-.LDGEMM_L2x4_BEGIN:
+LDGEMM_L2x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L2x4_END
+ ble LDGEMM_L2x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x4_SUB0
+ ble LDGEMM_L2x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x4_SUB4
+ ble LDGEMM_L2x4_SUB4
-.LDGEMM_L2x4_LOOP_START:
+LDGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
@@ -861,11 +1020,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_2
addic. L, L, -2
- ble .LDGEMM_L2x4_LOOP_END
+ ble LDGEMM_L2x4_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L2x4_LOOP:
+LDGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
@@ -878,9 +1037,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_2
addic. L, L, -1
- bgt .LDGEMM_L2x4_LOOP
+ bgt LDGEMM_L2x4_LOOP
-.LDGEMM_L2x4_LOOP_END:
+LDGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
@@ -892,9 +1051,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_1
KERNEL2x4_E2
- b .LDGEMM_L2x4_SUB1
+ b LDGEMM_L2x4_SUB1
-.LDGEMM_L2x4_SUB4:
+LDGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
@@ -906,48 +1065,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x4_SUB1
KERNEL2x4_SUB1
- b .LDGEMM_L2x4_SUB1
+ b LDGEMM_L2x4_SUB1
-.LDGEMM_L2x4_SUB0:
+LDGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x4_SAVE
- b .LDGEMM_L2x4_SUB2
+ ble LDGEMM_L2x4_SAVE
+ b LDGEMM_L2x4_SUB2
-.LDGEMM_L2x4_SUB1:
+LDGEMM_L2x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x4_SAVE
+ ble LDGEMM_L2x4_SAVE
-.LDGEMM_L2x4_SUB2:
+LDGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x4_SUB2
+ bgt LDGEMM_L2x4_SUB2
-.LDGEMM_L2x4_SAVE:
+LDGEMM_L2x4_SAVE:
SAVE2x4
-.LDGEMM_L2x4_END:
+LDGEMM_L2x4_END:
-.LDGEMM_L2x2_BEGIN:
+LDGEMM_L2x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L2x2_END
+ ble LDGEMM_L2x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x2_SUB0
+ ble LDGEMM_L2x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x2_SUB4
+ ble LDGEMM_L2x2_SUB4
-.LDGEMM_L2x2_LOOP_START:
+LDGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
@@ -961,11 +1120,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_2
addic. L, L, -2
- ble .LDGEMM_L2x2_LOOP_END
+ ble LDGEMM_L2x2_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L2x2_LOOP:
+LDGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
@@ -978,9 +1137,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_2
addic. L, L, -1
- bgt .LDGEMM_L2x2_LOOP
+ bgt LDGEMM_L2x2_LOOP
-.LDGEMM_L2x2_LOOP_END:
+LDGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
@@ -992,9 +1151,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_1
KERNEL2x2_E2
- b .LDGEMM_L2x2_SUB1
+ b LDGEMM_L2x2_SUB1
-.LDGEMM_L2x2_SUB4:
+LDGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
@@ -1006,48 +1165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x2_SUB1
KERNEL2x2_SUB1
- b .LDGEMM_L2x2_SUB1
+ b LDGEMM_L2x2_SUB1
-.LDGEMM_L2x2_SUB0:
+LDGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x2_SAVE
- b .LDGEMM_L2x2_SUB2
+ ble LDGEMM_L2x2_SAVE
+ b LDGEMM_L2x2_SUB2
-.LDGEMM_L2x2_SUB1:
+LDGEMM_L2x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x2_SAVE
+ ble LDGEMM_L2x2_SAVE
-.LDGEMM_L2x2_SUB2:
+LDGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x2_SUB2
+ bgt LDGEMM_L2x2_SUB2
-.LDGEMM_L2x2_SAVE:
+LDGEMM_L2x2_SAVE:
SAVE2x2
-.LDGEMM_L2x2_END:
+LDGEMM_L2x2_END:
-.LDGEMM_L2x1_BEGIN:
+LDGEMM_L2x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L2x1_END
+ ble LDGEMM_L2x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L2x1_SUB0
+ ble LDGEMM_L2x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L2x1_SUB4
+ ble LDGEMM_L2x1_SUB4
-.LDGEMM_L2x1_LOOP_START:
+LDGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
@@ -1061,11 +1220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_2
addic. L, L, -2
- ble .LDGEMM_L2x1_LOOP_END
+ ble LDGEMM_L2x1_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L2x1_LOOP:
+LDGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
@@ -1078,9 +1237,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_2
addic. L, L, -1
- bgt .LDGEMM_L2x1_LOOP
+ bgt LDGEMM_L2x1_LOOP
-.LDGEMM_L2x1_LOOP_END:
+LDGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
@@ -1092,9 +1251,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_1
KERNEL2x1_E2
- b .LDGEMM_L2x1_SUB1
+ b LDGEMM_L2x1_SUB1
-.LDGEMM_L2x1_SUB4:
+LDGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
@@ -1106,59 +1265,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL2x1_SUB1
KERNEL2x1_SUB1
- b .LDGEMM_L2x1_SUB1
+ b LDGEMM_L2x1_SUB1
-.LDGEMM_L2x1_SUB0:
+LDGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L2x1_SAVE
- b .LDGEMM_L2x1_SUB2
+ ble LDGEMM_L2x1_SAVE
+ b LDGEMM_L2x1_SUB2
-.LDGEMM_L2x1_SUB1:
+LDGEMM_L2x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L2x1_SAVE
+ ble LDGEMM_L2x1_SAVE
-.LDGEMM_L2x1_SUB2:
+LDGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L2x1_SUB2
+ bgt LDGEMM_L2x1_SUB2
-.LDGEMM_L2x1_SAVE:
+LDGEMM_L2x1_SAVE:
SAVE2x1
-.LDGEMM_L2x1_END:
+LDGEMM_L2x1_END:
slwi T1, K, 4
add B, B, T1
-.LDGEMM_L2_END:
-.LDGEMM_L1_BEGIN:
+LDGEMM_L2_END:
+LDGEMM_L1_BEGIN:
andi. T1, N, 1
- ble .LDGEMM_L1_END
+ ble LDGEMM_L1_END
mr CO, C
mr AO, A
srawi. I, M, 4
- ble .LDGEMM_L1x16_END
+ ble LDGEMM_L1x16_END
-.LDGEMM_L1x16_BEGIN:
+LDGEMM_L1x16_BEGIN:
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x16_SUB0
+ ble LDGEMM_L1x16_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x16_SUB4
+ ble LDGEMM_L1x16_SUB4
-.LDGEMM_L1x16_LOOP_START:
+LDGEMM_L1x16_LOOP_START:
dcbt AO, PRE
LOAD1x16_1
@@ -1181,11 +1340,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_2
addic. L, L, -2
- ble .LDGEMM_L1x16_LOOP_END
+ ble LDGEMM_L1x16_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L1x16_LOOP:
+LDGEMM_L1x16_LOOP:
dcbt AO, PRE
KERNEL1x16_1
@@ -1206,9 +1365,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_2
addic. L, L, -1
- bgt .LDGEMM_L1x16_LOOP
+ bgt LDGEMM_L1x16_LOOP
-.LDGEMM_L1x16_LOOP_END:
+LDGEMM_L1x16_LOOP_END:
dcbt AO, PRE
KERNEL1x16_1
@@ -1227,9 +1386,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_1
KERNEL1x16_E2
- b .LDGEMM_L1x16_SUB1
+ b LDGEMM_L1x16_SUB1
-.LDGEMM_L1x16_SUB4:
+LDGEMM_L1x16_SUB4:
dcbt AO, PRE
KERNEL1x16_SUBI1
@@ -1245,86 +1404,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x16_SUB1
KERNEL1x16_SUB1
- b .LDGEMM_L1x16_SUB1
+ b LDGEMM_L1x16_SUB1
-.LDGEMM_L1x16_SUB0:
+LDGEMM_L1x16_SUB0:
andi. L, K, 7
KERNEL1x16_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x16_SAVE
- b .LDGEMM_L1x16_SUB2
+ ble LDGEMM_L1x16_SAVE
+ b LDGEMM_L1x16_SUB2
-.LDGEMM_L1x16_SUB1:
+LDGEMM_L1x16_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x16_SAVE
+ ble LDGEMM_L1x16_SAVE
-.LDGEMM_L1x16_SUB2:
+LDGEMM_L1x16_SUB2:
KERNEL1x16_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x16_SUB2
+ bgt LDGEMM_L1x16_SUB2
-.LDGEMM_L1x16_SAVE:
+LDGEMM_L1x16_SAVE:
SAVE1x16
addic. I, I, -1
- bgt .LDGEMM_L1x16_BEGIN
+ bgt LDGEMM_L1x16_BEGIN
-.LDGEMM_L1x16_END:
+LDGEMM_L1x16_END:
-.LDGEMM_L1x8_BEGIN:
+LDGEMM_L1x8_BEGIN:
andi. T2, M, 15
- ble .LDGEMM_L1x1_END
+ ble LDGEMM_L1x1_END
andi. T1, M, 8
- ble .LDGEMM_L1x8_END
+ ble LDGEMM_L1x8_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x8_SUB0
+ ble LDGEMM_L1x8_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x8_SUB4
+ ble LDGEMM_L1x8_SUB4
-.LDGEMM_L1x8_LOOP_START:
+LDGEMM_L1x8_LOOP_START:
+ dcbt AO, PRE
LOAD1x8_1
KERNEL1x8_I1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -2
- ble .LDGEMM_L1x8_LOOP_END
+ ble LDGEMM_L1x8_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L1x8_LOOP:
+LDGEMM_L1x8_LOOP:
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
+ dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -1
- bgt .LDGEMM_L1x8_LOOP
+ bgt LDGEMM_L1x8_LOOP
-.LDGEMM_L1x8_LOOP_END:
+LDGEMM_L1x8_LOOP_END:
KERNEL1x8_1
KERNEL1x8_2
@@ -1336,9 +1504,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x8_1
KERNEL1x8_E2
- b .LDGEMM_L1x8_SUB1
+ b LDGEMM_L1x8_SUB1
-.LDGEMM_L1x8_SUB4:
+LDGEMM_L1x8_SUB4:
KERNEL1x8_SUBI1
KERNEL1x8_SUB1
@@ -1350,48 +1518,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x8_SUB1
KERNEL1x8_SUB1
- b .LDGEMM_L1x8_SUB1
+ b LDGEMM_L1x8_SUB1
-.LDGEMM_L1x8_SUB0:
+LDGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x8_SAVE
- b .LDGEMM_L1x8_SUB2
+ ble LDGEMM_L1x8_SAVE
+ b LDGEMM_L1x8_SUB2
-.LDGEMM_L1x8_SUB1:
+LDGEMM_L1x8_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x8_SAVE
+ ble LDGEMM_L1x8_SAVE
-.LDGEMM_L1x8_SUB2:
+LDGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x8_SUB2
+ bgt LDGEMM_L1x8_SUB2
-.LDGEMM_L1x8_SAVE:
+LDGEMM_L1x8_SAVE:
SAVE1x8
-.LDGEMM_L1x8_END:
+LDGEMM_L1x8_END:
-.LDGEMM_L1x4_BEGIN:
+LDGEMM_L1x4_BEGIN:
andi. T1, M, 4
- ble .LDGEMM_L1x4_END
+ ble LDGEMM_L1x4_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x4_SUB0
+ ble LDGEMM_L1x4_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x4_SUB4
+ ble LDGEMM_L1x4_SUB4
-.LDGEMM_L1x4_LOOP_START:
+LDGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
@@ -1405,11 +1573,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_2
addic. L, L, -2
- ble .LDGEMM_L1x4_LOOP_END
+ ble LDGEMM_L1x4_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L1x4_LOOP:
+LDGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
@@ -1422,9 +1590,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_2
addic. L, L, -1
- bgt .LDGEMM_L1x4_LOOP
+ bgt LDGEMM_L1x4_LOOP
-.LDGEMM_L1x4_LOOP_END:
+LDGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
@@ -1436,9 +1604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_1
KERNEL1x4_E2
- b .LDGEMM_L1x4_SUB1
+ b LDGEMM_L1x4_SUB1
-.LDGEMM_L1x4_SUB4:
+LDGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
@@ -1450,48 +1618,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x4_SUB1
KERNEL1x4_SUB1
- b .LDGEMM_L1x4_SUB1
+ b LDGEMM_L1x4_SUB1
-.LDGEMM_L1x4_SUB0:
+LDGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x4_SAVE
- b .LDGEMM_L1x4_SUB2
+ ble LDGEMM_L1x4_SAVE
+ b LDGEMM_L1x4_SUB2
-.LDGEMM_L1x4_SUB1:
+LDGEMM_L1x4_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x4_SAVE
+ ble LDGEMM_L1x4_SAVE
-.LDGEMM_L1x4_SUB2:
+LDGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x4_SUB2
+ bgt LDGEMM_L1x4_SUB2
-.LDGEMM_L1x4_SAVE:
+LDGEMM_L1x4_SAVE:
SAVE1x4
-.LDGEMM_L1x4_END:
+LDGEMM_L1x4_END:
-.LDGEMM_L1x2_BEGIN:
+LDGEMM_L1x2_BEGIN:
andi. T1, M, 2
- ble .LDGEMM_L1x2_END
+ ble LDGEMM_L1x2_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x2_SUB0
+ ble LDGEMM_L1x2_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x2_SUB4
+ ble LDGEMM_L1x2_SUB4
-.LDGEMM_L1x2_LOOP_START:
+LDGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
@@ -1505,11 +1673,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_2
addic. L, L, -2
- ble .LDGEMM_L1x2_LOOP_END
+ ble LDGEMM_L1x2_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L1x2_LOOP:
+LDGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
@@ -1522,9 +1690,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_2
addic. L, L, -1
- bgt .LDGEMM_L1x2_LOOP
+ bgt LDGEMM_L1x2_LOOP
-.LDGEMM_L1x2_LOOP_END:
+LDGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
@@ -1536,9 +1704,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_1
KERNEL1x2_E2
- b .LDGEMM_L1x2_SUB1
+ b LDGEMM_L1x2_SUB1
-.LDGEMM_L1x2_SUB4:
+LDGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
@@ -1550,48 +1718,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x2_SUB1
KERNEL1x2_SUB1
- b .LDGEMM_L1x2_SUB1
+ b LDGEMM_L1x2_SUB1
-.LDGEMM_L1x2_SUB0:
+LDGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x2_SAVE
- b .LDGEMM_L1x2_SUB2
+ ble LDGEMM_L1x2_SAVE
+ b LDGEMM_L1x2_SUB2
-.LDGEMM_L1x2_SUB1:
+LDGEMM_L1x2_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x2_SAVE
+ ble LDGEMM_L1x2_SAVE
-.LDGEMM_L1x2_SUB2:
+LDGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x2_SUB2
+ bgt LDGEMM_L1x2_SUB2
-.LDGEMM_L1x2_SAVE:
+LDGEMM_L1x2_SAVE:
SAVE1x2
-.LDGEMM_L1x2_END:
+LDGEMM_L1x2_END:
-.LDGEMM_L1x1_BEGIN:
+LDGEMM_L1x1_BEGIN:
andi. T1, M, 1
- ble .LDGEMM_L1x1_END
+ ble LDGEMM_L1x1_END
mr BO, B
srawi. L, K, 3
- ble .LDGEMM_L1x1_SUB0
+ ble LDGEMM_L1x1_SUB0
cmpwi cr0, L, 1
- ble .LDGEMM_L1x1_SUB4
+ ble LDGEMM_L1x1_SUB4
-.LDGEMM_L1x1_LOOP_START:
+LDGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
@@ -1605,11 +1773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_2
addic. L, L, -2
- ble .LDGEMM_L1x1_LOOP_END
+ ble LDGEMM_L1x1_LOOP_END
- .align 5
+ MY_ALIGN
-.LDGEMM_L1x1_LOOP:
+LDGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
@@ -1622,9 +1790,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_2
addic. L, L, -1
- bgt .LDGEMM_L1x1_LOOP
+ bgt LDGEMM_L1x1_LOOP
-.LDGEMM_L1x1_LOOP_END:
+LDGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
@@ -1636,9 +1804,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_1
KERNEL1x1_E2
- b .LDGEMM_L1x1_SUB1
+ b LDGEMM_L1x1_SUB1
-.LDGEMM_L1x1_SUB4:
+LDGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
@@ -1650,34 +1818,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
KERNEL1x1_SUB1
KERNEL1x1_SUB1
- b .LDGEMM_L1x1_SUB1
+ b LDGEMM_L1x1_SUB1
-.LDGEMM_L1x1_SUB0:
+LDGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
- ble .LDGEMM_L1x1_SAVE
- b .LDGEMM_L1x1_SUB2
+ ble LDGEMM_L1x1_SAVE
+ b LDGEMM_L1x1_SUB2
-.LDGEMM_L1x1_SUB1:
+LDGEMM_L1x1_SUB1:
andi. L, K, 7
- ble .LDGEMM_L1x1_SAVE
+ ble LDGEMM_L1x1_SAVE
-.LDGEMM_L1x1_SUB2:
+LDGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
- bgt .LDGEMM_L1x1_SUB2
+ bgt LDGEMM_L1x1_SUB2
-.LDGEMM_L1x1_SAVE:
+LDGEMM_L1x1_SAVE:
SAVE1x1
-.LDGEMM_L1x1_END:
+LDGEMM_L1x1_END:
-.LDGEMM_L1_END:
+LDGEMM_L1_END:
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
index 27c05e08e..5be517f7c 100644
--- a/kernel/power/dgemm_macros_16x4_power8.S
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
- addi AO, AO, 64
-
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
+
.macro KERNEL4x16_I1
- xvmuldp vs32, vs0, vs24
- xvmuldp vs33, vs1, vs24
- xvmuldp vs34, vs2, vs24
- xvmuldp vs35, vs3, vs24
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
- lxvd2x vs8, 0, AO
+ lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
- xvmuldp vs36, vs4, vs24
- xvmuldp vs37, vs5, vs24
- xvmuldp vs38, vs6, vs24
- xvmuldp vs39, vs7, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
- xvmuldp vs40, vs0, vs25
- xvmuldp vs41, vs1, vs25
- xvmuldp vs42, vs2, vs25
- xvmuldp vs43, vs3, vs25
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
- xvmuldp vs44, vs4, vs25
- xvmuldp vs45, vs5, vs25
- xvmuldp vs46, vs6, vs25
- xvmuldp vs47, vs7, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
- addi AO, AO, 64
- xvmuldp vs48, vs0, vs26
- xvmuldp vs49, vs1, vs26
- xvmuldp vs50, vs2, vs26
- xvmuldp vs51, vs3, vs26
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
- xvmuldp vs52, vs4, vs26
- xvmuldp vs53, vs5, vs26
- xvmuldp vs54, vs6, vs26
- xvmuldp vs55, vs7, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
- xvmuldp vs56, vs0, vs27
- xvmuldp vs57, vs1, vs27
- xvmuldp vs58, vs2, vs27
- xvmuldp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
- xvmuldp vs60, vs4, vs27
- xvmuldp vs61, vs5, vs27
- xvmuldp vs62, vs6, vs27
- xvmuldp vs63, vs7, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
- addi AO, AO, 64
- addi BO, BO, 32
+ addi AO, AO, 128
.endm
+
+
.macro KERNEL4x16_1
xvmaddadp vs32, vs0, vs24
@@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
- lxvd2x vs8, 0, AO
+ lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
@@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
- lxvd2x vs10, o32, AO
- lxvd2x vs11, o48, AO
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
- addi AO, AO, 64
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
- lxvd2x vs12, 0, AO
- lxvd2x vs13, o16, AO
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
- lxvd2x vs14, o32, AO
- lxvd2x vs15, o48, AO
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
@@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
@@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
- addi AO, AO, 64
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
@@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31
- addi AO, AO, 64
+ addi AO, AO, 128
addi BO, BO, 32
.endm
+.macro KERNEL4x16_L1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, o0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, o64, AO
+ lxvd2x vs13, o80, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o96, AO
+ lxvd2x vs15, o112, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 128
+
+.endm
+
+.macro KERNEL4x16_L2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, o32, BO
+ lxvdsx vs25, o40, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o48, BO
+ lxvdsx vs27, o56, BO
+
+ xvmaddadp vs60, vs12, vs31
+ addi AO, AO, 128
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ addi BO, BO, 64
+ xvmaddadp vs63, vs15, vs31
+
+
+.endm
+
+
.macro KERNEL4x16_E2
@@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
- addi AO, AO, 64
- addi BO, BO, 32
- lxvd2x vs4, 0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
+ lxvd2x vs4, o64, AO
+ lxvd2x vs5, o80, AO
+ lxvd2x vs6, o96, AO
+ lxvd2x vs7, o112, AO
- addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
@@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
+ addi BO, BO, 32
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
@@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
+ addi AO, AO, 128
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
@@ -429,195 +559,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
- mr T1, CO
- addi T2, T1, 64
-
-#ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
-
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
-#endif
+ add T2, CO, LDC
+
+ lxvd2x vs0, 0, CO
+ lxvd2x vs1, o16, CO
+ lxvd2x vs2, o32, CO
+ lxvd2x vs3, o48, CO
+ lxvd2x vs4, o64, CO
+ lxvd2x vs5, o80, CO
+ add T3, T2, LDC
+ lxvd2x vs6, o96, CO
+ lxvd2x vs7, o112, CO
+
+ lxvd2x vs8, 0, T2
+ lxvd2x vs9, o16, T2
+ lxvd2x vs10, o32, T2
+ lxvd2x vs11, o48, T2
+ lxvd2x vs12, o64, T2
+ lxvd2x vs13, o80, T2
+ add T4, T3, LDC
+ lxvd2x vs14, o96, T2
+ lxvd2x vs15, o112, T2
+
+ lxvd2x vs24, 0, T3
+ lxvd2x vs25, o16, T3
+ lxvd2x vs26, o32, T3
+ lxvd2x vs27, o48, T3
+ lxvd2x vs28, o64, T3
+ lxvd2x vs29, o80, T3
+ lxvd2x vs30, o96, T3
+ lxvd2x vs31, o112, T3
-#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
+ lxvd2x vs32, 0, T4
xvmaddadp vs1, vs33, alpha_r
+ lxvd2x vs33, o16, T4
xvmaddadp vs2, vs34, alpha_r
+ lxvd2x vs34, o32, T4
xvmaddadp vs3, vs35, alpha_r
+ lxvd2x vs35, o48, T4
xvmaddadp vs4, vs36, alpha_r
+ lxvd2x vs36, o64, T4
xvmaddadp vs5, vs37, alpha_r
+ lxvd2x vs37, o80, T4
xvmaddadp vs6, vs38, alpha_r
+ lxvd2x vs38, o96, T4
xvmaddadp vs7, vs39, alpha_r
-#else
- xvmuldp vs0, vs32, alpha_r
- xvmuldp vs1, vs33, alpha_r
- xvmuldp vs2, vs34, alpha_r
- xvmuldp vs3, vs35, alpha_r
- xvmuldp vs4, vs36, alpha_r
- xvmuldp vs5, vs37, alpha_r
- xvmuldp vs6, vs38, alpha_r
- xvmuldp vs7, vs39, alpha_r
-#endif
-
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
-
- dcbt T1, PRE
-
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
+ lxvd2x vs39, o112, T4
-#ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
-
- lxvd2x vs12, 0, T2
- lxvd2x vs13, o16, T2
- lxvd2x vs14, o32, T2
- lxvd2x vs15, o48, T2
-#endif
-
-#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
+
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
-#else
- xvmuldp vs8, vs40, alpha_r
- xvmuldp vs9, vs41, alpha_r
- xvmuldp vs10, vs42, alpha_r
- xvmuldp vs11, vs43, alpha_r
- xvmuldp vs12, vs44, alpha_r
- xvmuldp vs13, vs45, alpha_r
- xvmuldp vs14, vs46, alpha_r
- xvmuldp vs15, vs47, alpha_r
-#endif
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
+ xvmaddadp vs24, vs48, alpha_r
+ xvmaddadp vs25, vs49, alpha_r
+ xvmaddadp vs26, vs50, alpha_r
+ xvmaddadp vs27, vs51, alpha_r
- dcbt T1, PRE
+ xvmaddadp vs28, vs52, alpha_r
+ xvmaddadp vs29, vs53, alpha_r
+ xvmaddadp vs30, vs54, alpha_r
+ xvmaddadp vs31, vs55, alpha_r
- stxvd2x vs12, 0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
-
- add T1, T1, LDC
- add T2, T2, LDC
+ stxvd2x vs0, 0, CO
+ stxvd2x vs1, o16, CO
+ stxvd2x vs2, o32, CO
+ stxvd2x vs3, o48, CO
-#ifndef TRMMKERNEL
- lxvd2x vs0, 0, T1
- lxvd2x vs1, o16, T1
- lxvd2x vs2, o32, T1
- lxvd2x vs3, o48, T1
+ stxvd2x vs4, o64, CO
+ stxvd2x vs5, o80, CO
+ stxvd2x vs6, o96, CO
+ stxvd2x vs7, o112, CO
- lxvd2x vs4, 0, T2
- lxvd2x vs5, o16, T2
- lxvd2x vs6, o32, T2
- lxvd2x vs7, o48, T2
-#endif
+ xvmaddadp vs32, vs56, alpha_r
+ xvmaddadp vs33, vs57, alpha_r
+ xvmaddadp vs34, vs58, alpha_r
+ xvmaddadp vs35, vs59, alpha_r
-#ifndef TRMMKERNEL
- xvmaddadp vs0, vs48, alpha_r
- xvmaddadp vs1, vs49, alpha_r
- xvmaddadp vs2, vs50, alpha_r
- xvmaddadp vs3, vs51, alpha_r
- xvmaddadp vs4, vs52, alpha_r
- xvmaddadp vs5, vs53, alpha_r
- xvmaddadp vs6, vs54, alpha_r
- xvmaddadp vs7, vs55, alpha_r
-#else
- xvmuldp vs0, vs48, alpha_r
- xvmuldp vs1, vs49, alpha_r
- xvmuldp vs2, vs50, alpha_r
- xvmuldp vs3, vs51, alpha_r
- xvmuldp vs4, vs52, alpha_r
- xvmuldp vs5, vs53, alpha_r
- xvmuldp vs6, vs54, alpha_r
- xvmuldp vs7, vs55, alpha_r
-#endif
+ xvmaddadp vs36, vs60, alpha_r
+ xvmaddadp vs37, vs61, alpha_r
+ xvmaddadp vs38, vs62, alpha_r
+ xvmaddadp vs39, vs63, alpha_r
- stxvd2x vs0, 0, T1
- stxvd2x vs1, o16, T1
- stxvd2x vs2, o32, T1
- stxvd2x vs3, o48, T1
+ addi CO, CO, 128
- dcbt T1, PRE
+ stxvd2x vs8, o0, T2
+ stxvd2x vs9, o16, T2
+ stxvd2x vs10, o32, T2
+ stxvd2x vs11, o48, T2
- stxvd2x vs4, 0, T2
- stxvd2x vs5, o16, T2
- stxvd2x vs6, o32, T2
- stxvd2x vs7, o48, T2
+ stxvd2x vs12, o64, T2
+ stxvd2x vs13, o80, T2
+ stxvd2x vs14, o96, T2
+ stxvd2x vs15, o112, T2
- add T1, T1, LDC
- add T2, T2, LDC
+ stxvd2x vs24, 0, T3
+ stxvd2x vs25, o16, T3
+ stxvd2x vs28, o64, T3
+ stxvd2x vs29, o80, T3
-#ifndef TRMMKERNEL
- lxvd2x vs8, 0, T1
- lxvd2x vs9, o16, T1
- lxvd2x vs10, o32, T1
- lxvd2x vs11, o48, T1
+ stxvd2x vs26, o32, T3
+ stxvd2x vs27, o48, T3
+ stxvd2x vs30, o96, T3
+ stxvd2x vs31, o112, T3
- lxvd2x vs12, 0, T2
- lxvd2x vs13, o16, T2
- lxvd2x vs14, o32, T2
- lxvd2x vs15, o48, T2
-#endif
+ stxvd2x vs32, o0, T4
+ stxvd2x vs33, o16, T4
+ stxvd2x vs34, o32, T4
+ stxvd2x vs35, o48, T4
-#ifndef TRMMKERNEL
- xvmaddadp vs8, vs56, alpha_r
- xvmaddadp vs9, vs57, alpha_r
- xvmaddadp vs10, vs58, alpha_r
- xvmaddadp vs11, vs59, alpha_r
- xvmaddadp vs12, vs60, alpha_r
- xvmaddadp vs13, vs61, alpha_r
- xvmaddadp vs14, vs62, alpha_r
- xvmaddadp vs15, vs63, alpha_r
-#else
- xvmuldp vs8, vs56, alpha_r
- xvmuldp vs9, vs57, alpha_r
- xvmuldp vs10, vs58, alpha_r
- xvmuldp vs11, vs59, alpha_r
- xvmuldp vs12, vs60, alpha_r
- xvmuldp vs13, vs61, alpha_r
- xvmuldp vs14, vs62, alpha_r
- xvmuldp vs15, vs63, alpha_r
-#endif
-
- stxvd2x vs8, 0, T1
- stxvd2x vs9, o16, T1
- stxvd2x vs10, o32, T1
- stxvd2x vs11, o48, T1
-
- dcbt T1, PRE
-
- stxvd2x vs12, 0, T2
- stxvd2x vs13, o16, T2
- stxvd2x vs14, o32, T2
- stxvd2x vs15, o48, T2
+ stxvd2x vs36, o64, T4
+ stxvd2x vs37, o80, T4
+ stxvd2x vs38, o96, T4
+ stxvd2x vs39, o112, T4
- addi CO, CO, 128
.endm
diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S
new file mode 100644
index 000000000..31966047f
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_4_power8.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define o64 r17
+#define o80 r18
+#define o96 r19
+#define o112 r20
+#define o8 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTU1 r27
+#define NOTU2 r30
+#define T1 r31
+
+#define o0 0
+
+#include "dgemm_ncopy_macros_4_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+
+ li PREA, 384
+ li PREB, 384
+
+ li o8, 8
+ li o16, 16
+ li o32, 32
+ li o48, 48
+ li o64, 64
+ li o80, 80
+ li o96, 96
+ li o112, 112
+
+#include "dgemm_ncopy_logic_4_power8.S"
+
+L999:
+
+ li r3, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S
new file mode 100644
index 000000000..6944a7818
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_logic_4_power8.S
@@ -0,0 +1,237 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ mr BO, B
+ srawi. I, N, 2
+ ble DCOPYN_L2_BEGIN
+
+
+DCOPYN_L4_BEGIN:
+
+
+DCOPYN_L4_LOOP:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+
+DCOPYN_L4x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L4x16_END
+
+DCOPYN_L4x16_LOOP:
+
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ COPY_4x16
+ addic. J, J, -1
+ bgt DCOPYN_L4x16_LOOP
+
+DCOPYN_L4x16_END:
+
+
+DCOPYN_L4x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L4x8_END
+ COPY_4x8
+
+DCOPYN_L4x8_END:
+
+
+DCOPYN_L4x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L4x4_END
+ COPY_4x4
+
+DCOPYN_L4x4_END:
+
+
+DCOPYN_L4x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L4x2_END
+ COPY_4x2
+
+DCOPYN_L4x2_END:
+
+
+DCOPYN_L4x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L4x1_END
+ COPY_4x1
+
+DCOPYN_L4x1_END:
+
+
+DCOPYN_L4_END:
+
+ addic. I, I, -1
+ bgt DCOPYN_L4_LOOP
+
+DCOPYN_L2_BEGIN:
+
+	andi.		T1, N, 2	/* two columns left? (symbolic N, not bare "4" = r4) */
+	ble		DCOPYN_L2_END
+
+DCOPYN_L2_LOOP:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+
+DCOPYN_L2x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L2x16_END
+
+DCOPYN_L2x16_LOOP:
+
+ COPY_2x16
+ addic. J, J, -1
+ bgt DCOPYN_L2x16_LOOP
+
+DCOPYN_L2x16_END:
+
+
+DCOPYN_L2x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L2x8_END
+ COPY_2x8
+
+DCOPYN_L2x8_END:
+
+
+DCOPYN_L2x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L2x4_END
+ COPY_2x4
+
+DCOPYN_L2x4_END:
+
+
+DCOPYN_L2x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L2x2_END
+ COPY_2x2
+
+DCOPYN_L2x2_END:
+
+
+DCOPYN_L2x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L2x1_END
+ COPY_2x1
+
+DCOPYN_L2x1_END:
+
+
+DCOPYN_L2_END:
+
+
+DCOPYN_L1_BEGIN:
+
+	andi.		T1, N, 1	/* one column left? (symbolic N, not bare "4" = r4) */
+	ble		DCOPYN_L1_END
+
+DCOPYN_L1_LOOP:
+
+ mr A0, A
+ add A, A0, LDA
+
+DCOPYN_L1x16_BEGIN:
+
+ srawi. J, M, 4
+ ble DCOPYN_L1x16_END
+
+DCOPYN_L1x16_LOOP:
+
+ COPY_1x16
+ addic. J, J, -1
+ bgt DCOPYN_L1x16_LOOP
+
+DCOPYN_L1x16_END:
+
+
+DCOPYN_L1x8_BEGIN:
+
+ andi. J, M, 8
+ ble DCOPYN_L1x8_END
+ COPY_1x8
+
+DCOPYN_L1x8_END:
+
+
+DCOPYN_L1x4_BEGIN:
+
+ andi. J, M, 4
+ ble DCOPYN_L1x4_END
+ COPY_1x4
+
+DCOPYN_L1x4_END:
+
+
+DCOPYN_L1x2_BEGIN:
+
+ andi. J, M, 2
+ ble DCOPYN_L1x2_END
+ COPY_1x2
+
+DCOPYN_L1x2_END:
+
+
+DCOPYN_L1x1_BEGIN:
+
+ andi. J, M, 1
+ ble DCOPYN_L1x1_END
+ COPY_1x1
+
+DCOPYN_L1x1_END:
+
+
+DCOPYN_L1_END:
+
diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S
new file mode 100644
index 000000000..fafb09877
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_macros_4_power8.S
@@ -0,0 +1,698 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro COPY_4x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs8, o0, A1
+ lxvd2x vs24, o0, A3
+ lxvd2x vs16, o0, A2
+
+ lxvd2x vs1, o16, A0
+ lxvd2x vs9, o16, A1
+ lxvd2x vs17, o16, A2
+ lxvd2x vs25, o16, A3
+
+ lxvd2x vs2, o32, A0
+ lxvd2x vs10, o32, A1
+ lxvd2x vs18, o32, A2
+ lxvd2x vs26, o32, A3
+
+ lxvd2x vs3, o48, A0
+ lxvd2x vs11, o48, A1
+ lxvd2x vs19, o48, A2
+ lxvd2x vs27, o48, A3
+
+ lxvd2x vs4, o64, A0
+ lxvd2x vs12, o64, A1
+ lxvd2x vs20, o64, A2
+ lxvd2x vs28, o64, A3
+
+ lxvd2x vs5, o80, A0
+ lxvd2x vs13, o80, A1
+ lxvd2x vs21, o80, A2
+ lxvd2x vs29, o80, A3
+
+ lxvd2x vs6, o96, A0
+ lxvd2x vs14, o96, A1
+ lxvd2x vs22, o96, A2
+ lxvd2x vs30, o96, A3
+
+ lxvd2x vs7, o112, A0
+ lxvd2x vs15, o112, A1
+ lxvd2x vs23, o112, A2
+ lxvd2x vs31, o112, A3
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+ xxpermdi vs40, vs2, vs10, 0
+ xxpermdi vs41, vs18, vs26, 0
+ xxpermdi vs42, vs2, vs10, 3
+ xxpermdi vs43, vs18, vs26, 3
+
+ xxpermdi vs44, vs3, vs11, 0
+ xxpermdi vs45, vs19, vs27, 0
+ xxpermdi vs46, vs3, vs11, 3
+ xxpermdi vs47, vs19, vs27, 3
+
+ xxpermdi vs48, vs4, vs12, 0
+ xxpermdi vs49, vs20, vs28, 0
+ xxpermdi vs50, vs4, vs12, 3
+ xxpermdi vs51, vs20, vs28, 3
+
+ xxpermdi vs52, vs5, vs13, 0
+ xxpermdi vs53, vs21, vs29, 0
+ xxpermdi vs54, vs5, vs13, 3
+ xxpermdi vs55, vs21, vs29, 3
+
+ addi A0, A0, 128
+ addi A1, A1, 128
+
+ xxpermdi vs56, vs6, vs14, 0
+ xxpermdi vs57, vs22, vs30, 0
+ xxpermdi vs58, vs6, vs14, 3
+ xxpermdi vs59, vs22, vs30, 3
+
+ addi A3, A3, 128
+ addi A2, A2, 128
+
+ xxpermdi vs60, vs7, vs15, 0
+ xxpermdi vs61, vs23, vs31, 0
+ xxpermdi vs62, vs7, vs15, 3
+ xxpermdi vs63, vs23, vs31, 3
+
+ dcbt BO, PREB
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ dcbt BO, PREB
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+ dcbt BO, PREB
+
+ stxvd2x vs48, o0, BO
+ stxvd2x vs49, o16, BO
+ stxvd2x vs50, o32, BO
+ stxvd2x vs51, o48, BO
+ stxvd2x vs52, o64, BO
+ stxvd2x vs53, o80, BO
+ stxvd2x vs54, o96, BO
+ stxvd2x vs55, o112, BO
+ addi BO, BO, 128
+
+ dcbt BO, PREB
+
+ stxvd2x vs56, o0, BO
+ stxvd2x vs57, o16, BO
+ stxvd2x vs58, o32, BO
+ stxvd2x vs59, o48, BO
+ stxvd2x vs60, o64, BO
+ stxvd2x vs61, o80, BO
+ stxvd2x vs62, o96, BO
+ stxvd2x vs63, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs16, o0, A2
+ lxvd2x vs17, o16, A2
+ lxvd2x vs18, o32, A2
+ lxvd2x vs19, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs24, o0, A3
+ lxvd2x vs25, o16, A3
+ lxvd2x vs26, o32, A3
+ lxvd2x vs27, o48, A3
+ addi A3, A3, 64
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+ xxpermdi vs40, vs2, vs10, 0
+ xxpermdi vs41, vs18, vs26, 0
+ xxpermdi vs42, vs2, vs10, 3
+ xxpermdi vs43, vs18, vs26, 3
+
+ xxpermdi vs44, vs3, vs11, 0
+ xxpermdi vs45, vs19, vs27, 0
+ xxpermdi vs46, vs3, vs11, 3
+ xxpermdi vs47, vs19, vs27, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ addi A1, A1, 32
+
+
+ lxvd2x vs16, o0, A2
+ lxvd2x vs17, o16, A2
+ addi A2, A2, 32
+
+
+ lxvd2x vs24, o0, A3
+ lxvd2x vs25, o16, A3
+ addi A3, A3, 32
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+ xxpermdi vs36, vs1, vs9, 0
+ xxpermdi vs37, vs17, vs25, 0
+ xxpermdi vs38, vs1, vs9, 3
+ xxpermdi vs39, vs17, vs25, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs8, o0, A1
+ addi A1, A1, 16
+
+
+ lxvd2x vs16, o0, A2
+ addi A2, A2, 16
+
+
+ lxvd2x vs24, o0, A3
+ addi A3, A3, 16
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+ xxpermdi vs34, vs0, vs8, 3
+ xxpermdi vs35, vs16, vs24, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ lxsdx vs8, o0, A1
+ addi A1, A1, 8
+
+
+ lxsdx vs16, o0, A2
+ addi A2, A2, 8
+
+
+ lxsdx vs24, o0, A3
+ addi A3, A3, 8
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs16, vs24, 0
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+.macro COPY_2x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ lxvd2x vs4, o64, A0
+ lxvd2x vs5, o80, A0
+ lxvd2x vs6, o96, A0
+ lxvd2x vs7, o112, A0
+ addi A0, A0, 128
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ lxvd2x vs12, o64, A1
+ lxvd2x vs13, o80, A1
+ lxvd2x vs14, o96, A1
+ lxvd2x vs15, o112, A1
+ addi A1, A1, 128
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+ xxpermdi vs36, vs2, vs10, 0
+ xxpermdi vs37, vs2, vs10, 3
+
+ xxpermdi vs38, vs3, vs11, 0
+ xxpermdi vs39, vs3, vs11, 3
+
+ xxpermdi vs40, vs4, vs12, 0
+ xxpermdi vs41, vs4, vs12, 3
+
+ xxpermdi vs42, vs5, vs13, 0
+ xxpermdi vs43, vs5, vs13, 3
+
+ xxpermdi vs44, vs6, vs14, 0
+ xxpermdi vs45, vs6, vs14, 3
+
+ xxpermdi vs46, vs7, vs15, 0
+ xxpermdi vs47, vs7, vs15, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+ stxvd2x vs40, o0, BO
+ stxvd2x vs41, o16, BO
+ stxvd2x vs42, o32, BO
+ stxvd2x vs43, o48, BO
+ stxvd2x vs44, o64, BO
+ stxvd2x vs45, o80, BO
+ stxvd2x vs46, o96, BO
+ stxvd2x vs47, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ lxvd2x vs10, o32, A1
+ lxvd2x vs11, o48, A1
+ addi A1, A1, 64
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+ xxpermdi vs36, vs2, vs10, 0
+ xxpermdi vs37, vs2, vs10, 3
+
+ xxpermdi vs38, vs3, vs11, 0
+ xxpermdi vs39, vs3, vs11, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ stxvd2x vs36, o64, BO
+ stxvd2x vs37, o80, BO
+ stxvd2x vs38, o96, BO
+ stxvd2x vs39, o112, BO
+ addi BO, BO, 128
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs8, o0, A1
+ lxvd2x vs9, o16, A1
+ addi A1, A1, 32
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+ xxpermdi vs34, vs1, vs9, 0
+ xxpermdi vs35, vs1, vs9, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ stxvd2x vs34, o32, BO
+ stxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs8, o0, A1
+ addi A1, A1, 16
+
+
+ xxpermdi vs32, vs0, vs8, 0
+ xxpermdi vs33, vs0, vs8, 3
+
+
+ stxvd2x vs32, o0, BO
+ stxvd2x vs33, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ lxsdx vs8, o0, A1
+ addi A1, A1, 8
+
+
+ xxpermdi vs32, vs0, vs8, 0
+
+
+ stxvd2x vs32, o0, BO
+ addi BO, BO, 16
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro COPY_1x16
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ lxvd2x vs4, o64, A0
+ lxvd2x vs5, o80, A0
+ lxvd2x vs6, o96, A0
+ lxvd2x vs7, o112, A0
+ addi A0, A0, 128
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ stxvd2x vs2, o32, BO
+ stxvd2x vs3, o48, BO
+ addi BO, BO, 64
+
+ stxvd2x vs4, o0, BO
+ stxvd2x vs5, o16, BO
+ stxvd2x vs6, o32, BO
+ stxvd2x vs7, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ lxvd2x vs2, o32, A0
+ lxvd2x vs3, o48, A0
+ addi A0, A0, 64
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ stxvd2x vs2, o32, BO
+ stxvd2x vs3, o48, BO
+ addi BO, BO, 64
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvd2x vs0, o0, A0
+ lxvd2x vs1, o16, A0
+ addi A0, A0, 32
+
+
+ stxvd2x vs0, o0, BO
+ stxvd2x vs1, o16, BO
+ addi BO, BO, 32
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvd2x vs0, o0, A0
+ addi A0, A0, 16
+
+
+ stxvd2x vs0, o0, BO
+ addi BO, BO, 16
+
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxsdx vs0, o0, A0
+ addi A0, A0, 8
+
+
+ stxsdx vs0, o0, BO
+ addi BO, BO, 8
+
+
+.endm
+
diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
new file mode 100644
index 000000000..eb37877e0
--- /dev/null
+++ b/kernel/power/dgemm_tcopy_16_power8.S
@@ -0,0 +1,211 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o8 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define B16 r29
+#define M16 r30
+#define T1 r31
+
+#define o0 0
+
+#include "dgemm_tcopy_macros_16_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+ slwi M16, M, 4 + BASE_SHIFT
+
+ li T1, -16
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B8, N, T1
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B8, B8, M
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B8, B8, BASE_SHIFT
+ slwi B4, B4, BASE_SHIFT
+ slwi B2, B2, BASE_SHIFT
+ slwi B1, B1, BASE_SHIFT
+
+ add B8, B8, B
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M16, 128
+
+ li o8, 8
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "dgemm_tcopy_logic_16_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S
new file mode 100644
index 000000000..3c34a6167
--- /dev/null
+++ b/kernel/power/dgemm_tcopy_logic_16_power8.S
@@ -0,0 +1,285 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+	srawi.		I,	M,	2		// I = M / 4: number of 4-row blocks
+	ble		DCOPYT_L2_BEGIN
+
+
+DCOPYT_L4_BEGIN:					// ---- main loop: 4 rows of A per pass ----
+
+	mr		A0,	A			// A0..A3 -> four consecutive rows of A
+	add		A1,	A0,	LDA
+	add		A2,	A1,	LDA
+	add		A3,	A2,	LDA
+	add		A,	A3,	LDA
+	mr		B16,	B			// write base for this row block's 16-wide panels
+	addi		B,	B,	64*SIZE		// advance B past the 4x16 slot of every panel row
+
+	sradi.		J,	N,	4		// J = number of full 16-column panels
+	ble		DCOPYT_L4x8_BEGIN
+
+	mr		BO,	B16
+	addi		T2,	M16,	384		// secondary prefetch offset into B
+	mtctr		J				// panel loop runs on CTR (bdnz below)
+
+	.align 5
+
+DCOPYT_L4x16_LOOP:
+
+	addi		T1,	M16,	256
+
+	dcbt		A0,	PREA			// prefetch upcoming source rows
+	dcbt		A1,	PREA
+	dcbt		A2,	PREA
+	dcbt		A3,	PREA
+
+	dcbt		BO,	M16			// prefetch upcoming destination panels
+	dcbt		BO,	PREB
+	dcbt		BO,	T1
+	dcbt		BO,	T2
+
+	COPY_4x16					// pack one 4x16 block into B
+
+	add		BO,	BO,	M16		// next panel slot (stride M*16*SIZE)
+
+	// addic.		J,	J,	-1
+	bdnz+		DCOPYT_L4x16_LOOP
+
+DCOPYT_L4x8_BEGIN:
+
+	andi.		T1,	N,	8		// 8-column tail?
+	ble		DCOPYT_L4x4_BEGIN
+
+	mr		BO,	B8
+
+	COPY_4x8
+
+
+	addi		B8,	B8,	32*SIZE		// advance 8-wide tail cursor (4 rows x 8 cols)
+
+DCOPYT_L4x4_BEGIN:
+
+	andi.		T1,	N,	4		// 4-column tail?
+	ble		DCOPYT_L4x2_BEGIN
+
+	mr		BO,	B4
+
+	COPY_4x4
+
+
+	addi		B4,	B4,	16*SIZE		// 4 rows x 4 cols
+
+DCOPYT_L4x2_BEGIN:
+
+	andi.		T1,	N,	2		// 2-column tail?
+	ble		DCOPYT_L4x1_BEGIN
+
+	mr		BO,	B2
+
+	COPY_4x2
+
+
+	addi		B2,	B2,	8*SIZE		// 4 rows x 2 cols
+
+DCOPYT_L4x1_BEGIN:
+
+	andi.		T1,	N,	1		// 1-column tail?
+	ble		DCOPYT_L4_END
+
+	mr		BO,	B1
+
+	COPY_4x1
+
+
+	addi		B1,	B1,	4*SIZE		// 4 rows x 1 col
+
+DCOPYT_L4_END:
+
+	addic.		I,	I,	-1
+	bgt		DCOPYT_L4_BEGIN
+
+
+
+DCOPYT_L2_BEGIN:					// ---- leftover 2-row block (M & 2) ----
+
+	andi.		T1,	M,	2
+	ble		DCOPYT_L1_BEGIN
+
+	mr		A0,	A			// A0, A1 -> two remaining rows
+	add		A1,	A0,	LDA
+	add		A,	A1,	LDA
+	mr		B16,	B
+	addi		B,	B,	32*SIZE		// advance B past the 2x16 slot per panel row
+
+	sradi.		J,	N,	4		// J = full 16-column panels
+	ble		DCOPYT_L2x8_BEGIN
+
+	mr		BO,	B16
+
+DCOPYT_L2x16_LOOP:
+
+	COPY_2x16
+
+	add		BO,	BO,	M16		// next panel slot
+
+	addic.		J,	J,	-1
+	bgt		DCOPYT_L2x16_LOOP
+
+DCOPYT_L2x8_BEGIN:
+
+	andi.		T1,	N,	8		// 8-column tail?
+	ble		DCOPYT_L2x4_BEGIN
+
+	mr		BO,	B8
+
+	COPY_2x8
+
+
+	addi		B8,	B8,	16*SIZE		// 2 rows x 8 cols
+
+DCOPYT_L2x4_BEGIN:
+
+	andi.		T1,	N,	4		// 4-column tail?
+	ble		DCOPYT_L2x2_BEGIN
+
+	mr		BO,	B4
+
+	COPY_2x4
+
+
+	addi		B4,	B4,	8*SIZE		// 2 rows x 4 cols
+
+DCOPYT_L2x2_BEGIN:
+
+	andi.		T1,	N,	2		// 2-column tail?
+	ble		DCOPYT_L2x1_BEGIN
+
+	mr		BO,	B2
+
+	COPY_2x2
+
+
+	addi		B2,	B2,	4*SIZE		// 2 rows x 2 cols
+
+DCOPYT_L2x1_BEGIN:
+
+	andi.		T1,	N,	1		// 1-column tail?
+	ble		DCOPYT_L2_END
+
+	mr		BO,	B1
+
+	COPY_2x1
+
+
+	addi		B1,	B1,	2*SIZE		// 2 rows x 1 col
+
+DCOPYT_L2_END:
+
+
+DCOPYT_L1_BEGIN:					// ---- leftover single row (M & 1) ----
+
+	andi.		T1,	M,	1
+	ble		L999
+
+	mr		A0,	A			// A0 -> last remaining row
+	add		A,	A0,	LDA
+	mr		B16,	B
+	addi		B,	B,	16*SIZE		// advance B past the 1x16 slot per panel row
+
+	sradi.		J,	N,	4		// J = full 16-column panels
+	ble		DCOPYT_L1x8_BEGIN
+
+	mr		BO,	B16
+
+DCOPYT_L1x16_LOOP:
+
+	COPY_1x16
+
+	add		BO,	BO,	M16		// next panel slot
+
+	addic.		J,	J,	-1
+	bgt		DCOPYT_L1x16_LOOP
+
+DCOPYT_L1x8_BEGIN:
+
+	andi.		T1,	N,	8		// 8-column tail?
+	ble		DCOPYT_L1x4_BEGIN
+
+	mr		BO,	B8
+
+	COPY_1x8
+
+
+	addi		B8,	B8,	8*SIZE		// 1 row x 8 cols
+
+DCOPYT_L1x4_BEGIN:
+
+	andi.		T1,	N,	4		// 4-column tail?
+	ble		DCOPYT_L1x2_BEGIN
+
+	mr		BO,	B4
+
+	COPY_1x4
+
+
+	addi		B4,	B4,	4*SIZE		// 1 row x 4 cols
+
+DCOPYT_L1x2_BEGIN:
+
+	andi.		T1,	N,	2		// 2-column tail?
+	ble		DCOPYT_L1x1_BEGIN
+
+	mr		BO,	B2
+
+	COPY_1x2
+
+
+	addi		B2,	B2,	2*SIZE		// 1 row x 2 cols
+
+DCOPYT_L1x1_BEGIN:
+
+	andi.		T1,	N,	1		// 1-column tail?
+	ble		DCOPYT_L1_END
+
+	mr		BO,	B1
+
+	COPY_1x1
+
+
+	addi		B1,	B1,	1*SIZE		// final single element
+
+DCOPYT_L1_END:
+
diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S
new file mode 100644
index 000000000..333e23105
--- /dev/null
+++ b/kernel/power/dgemm_tcopy_macros_16_power8.S
@@ -0,0 +1,608 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro COPY_4x16	// pack a 4x16 double block: rows A0..A3 -> 64 contiguous doubles at BO, row after row; advances A0..A3 by 128 bytes, BO untouched (T1 is the write cursor)
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+	lxvd2x		vs40,	o0,	A1
+	lxvd2x		vs41,	o16,	A1
+	lxvd2x		vs42,	o32,	A1
+	lxvd2x		vs43,	o48,	A1
+	addi		A1,	A1,	64
+
+	lxvd2x		vs48,	o0,	A2
+	lxvd2x		vs49,	o16,	A2
+	lxvd2x		vs50,	o32,	A2
+	lxvd2x		vs51,	o48,	A2
+	addi		A2,	A2,	64
+
+	lxvd2x		vs56,	o0,	A3
+	lxvd2x		vs57,	o16,	A3
+	lxvd2x		vs58,	o32,	A3
+	lxvd2x		vs59,	o48,	A3
+	addi		A3,	A3,	64
+
+	lxvd2x		vs36,	o0,	A0
+	lxvd2x		vs37,	o16,	A0
+	lxvd2x		vs38,	o32,	A0
+	lxvd2x		vs39,	o48,	A0
+	addi		A0,	A0,	64
+
+	lxvd2x		vs44,	o0,	A1
+	lxvd2x		vs45,	o16,	A1
+	lxvd2x		vs46,	o32,	A1
+	lxvd2x		vs47,	o48,	A1
+	addi		A1,	A1,	64
+
+	lxvd2x		vs52,	o0,	A2
+	lxvd2x		vs53,	o16,	A2
+	lxvd2x		vs54,	o32,	A2
+	lxvd2x		vs55,	o48,	A2
+	addi		A2,	A2,	64
+
+	lxvd2x		vs60,	o0,	A3
+	lxvd2x		vs61,	o16,	A3
+	lxvd2x		vs62,	o32,	A3
+	lxvd2x		vs63,	o48,	A3
+	addi		A3,	A3,	64
+
+	mr		T1,	BO	// T1 = write cursor; BO is preserved for the caller
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs48,	o0,	T1
+	stxvd2x		vs49,	o16,	T1
+	stxvd2x		vs50,	o32,	T1
+	stxvd2x		vs51,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs52,	o0,	T1
+	stxvd2x		vs53,	o16,	T1
+	stxvd2x		vs54,	o32,	T1
+	stxvd2x		vs55,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs56,	o0,	T1
+	stxvd2x		vs57,	o16,	T1
+	stxvd2x		vs58,	o32,	T1
+	stxvd2x		vs59,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs60,	o0,	T1
+	stxvd2x		vs61,	o16,	T1
+	stxvd2x		vs62,	o32,	T1
+	stxvd2x		vs63,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8	// pack a 4x8 block: rows A0..A3 -> 32 contiguous doubles at BO; advances A0..A3 by 64 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+
+	lxvd2x		vs36,	o0,	A1
+	lxvd2x		vs37,	o16,	A1
+	lxvd2x		vs38,	o32,	A1
+	lxvd2x		vs39,	o48,	A1
+	addi		A1,	A1,	64
+
+
+	lxvd2x		vs40,	o0,	A2
+	lxvd2x		vs41,	o16,	A2
+	lxvd2x		vs42,	o32,	A2
+	lxvd2x		vs43,	o48,	A2
+	addi		A2,	A2,	64
+
+
+	lxvd2x		vs44,	o0,	A3
+	lxvd2x		vs45,	o16,	A3
+	lxvd2x		vs46,	o32,	A3
+	lxvd2x		vs47,	o48,	A3
+	addi		A3,	A3,	64
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4	// pack a 4x4 block: rows A0..A3 -> 16 contiguous doubles at BO; advances A0..A3 by 32 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	addi		A0,	A0,	32
+
+
+	lxvd2x		vs34,	o0,	A1
+	lxvd2x		vs35,	o16,	A1
+	addi		A1,	A1,	32
+
+
+	lxvd2x		vs36,	o0,	A2
+	lxvd2x		vs37,	o16,	A2
+	addi		A2,	A2,	32
+
+
+	lxvd2x		vs38,	o0,	A3
+	lxvd2x		vs39,	o16,	A3
+	addi		A3,	A3,	32
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2	// pack a 4x2 block: rows A0..A3 -> 8 contiguous doubles at BO; advances A0..A3 by 16 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	addi		A0,	A0,	16
+
+
+	lxvd2x		vs33,	o0,	A1
+	addi		A1,	A1,	16
+
+
+	lxvd2x		vs34,	o0,	A2
+	addi		A2,	A2,	16
+
+
+	lxvd2x		vs35,	o0,	A3
+	addi		A3,	A3,	16
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+
+	stxvd2x		vs35,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1	// pack one column of 4 doubles (one element from each of A0..A3) at BO; advances A0..A3 by 8 bytes
+
+	lxsdx		vs32,	o0,	A0
+	addi		A0,	A0,	8
+
+
+	lxsdx		vs33,	o0,	A1
+	addi		A1,	A1,	8
+
+
+	lxsdx		vs34,	o0,	A2
+	addi		A2,	A2,	8
+
+
+	lxsdx		vs35,	o0,	A3
+	addi		A3,	A3,	8
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxsdx		vs32,	o0,	T1
+
+	stxsdx		vs33,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	stxsdx		vs34,	o0,	T1
+
+	stxsdx		vs35,	o8,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+.macro COPY_2x16	// pack a 2x16 block: rows A0, A1 -> 32 contiguous doubles at BO; advances A0/A1 by 128 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+	lxvd2x		vs36,	o0,	A0
+	lxvd2x		vs37,	o16,	A0
+	lxvd2x		vs38,	o32,	A0
+	lxvd2x		vs39,	o48,	A0
+	addi		A0,	A0,	64
+
+
+	lxvd2x		vs40,	o0,	A1
+	lxvd2x		vs41,	o16,	A1
+	lxvd2x		vs42,	o32,	A1
+	lxvd2x		vs43,	o48,	A1
+	addi		A1,	A1,	64
+
+	lxvd2x		vs44,	o0,	A1
+	lxvd2x		vs45,	o16,	A1
+	lxvd2x		vs46,	o32,	A1
+	lxvd2x		vs47,	o48,	A1
+	addi		A1,	A1,	64
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8	// pack a 2x8 block: rows A0, A1 -> 16 contiguous doubles at BO; advances A0/A1 by 64 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+
+	lxvd2x		vs36,	o0,	A1
+	lxvd2x		vs37,	o16,	A1
+	lxvd2x		vs38,	o32,	A1
+	lxvd2x		vs39,	o48,	A1
+	addi		A1,	A1,	64
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4	// pack a 2x4 block: rows A0, A1 -> 8 contiguous doubles at BO; advances A0/A1 by 32 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	addi		A0,	A0,	32
+
+
+	lxvd2x		vs34,	o0,	A1
+	lxvd2x		vs35,	o16,	A1
+	addi		A1,	A1,	32
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2	// pack a 2x2 block: rows A0, A1 -> 4 contiguous doubles at BO; advances A0/A1 by 16 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	addi		A0,	A0,	16
+
+
+	lxvd2x		vs33,	o0,	A1
+	addi		A1,	A1,	16
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+
+	stxvd2x		vs33,	o16,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1	// pack one column of 2 doubles (one element from each of A0, A1) at BO; advances A0/A1 by 8 bytes
+
+	lxsdx		vs32,	o0,	A0
+	addi		A0,	A0,	8
+
+
+	lxsdx		vs33,	o0,	A1
+	addi		A1,	A1,	8
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxsdx		vs32,	o0,	T1
+
+	stxsdx		vs33,	o8,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro COPY_1x16	// copy 16 doubles of the single row A0 to BO; advances A0 by 128 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+	lxvd2x		vs36,	o0,	A0
+	lxvd2x		vs37,	o16,	A0
+	lxvd2x		vs38,	o32,	A0
+	lxvd2x		vs39,	o48,	A0
+	addi		A0,	A0,	64
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8	// copy 8 doubles of row A0 to BO; advances A0 by 64 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	lxvd2x		vs34,	o32,	A0
+	lxvd2x		vs35,	o48,	A0
+	addi		A0,	A0,	64
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4	// copy 4 doubles of row A0 to BO; advances A0 by 32 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	lxvd2x		vs33,	o16,	A0
+	addi		A0,	A0,	32
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2	// copy 2 doubles of row A0 to BO; advances A0 by 16 bytes
+
+	lxvd2x		vs32,	o0,	A0
+	addi		A0,	A0,	16
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxvd2x		vs32,	o0,	T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1	// copy a single double from A0 to BO; advances A0 by 8 bytes
+
+	lxsdx		vs32,	o0,	A0
+	addi		A0,	A0,	8
+
+
+	mr		T1,	BO	// write cursor; BO preserved
+
+	stxsdx		vs32,	o0,	T1
+
+.endm
+
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
index 2294128a2..e9dbd991e 100644
--- a/kernel/power/dtrmm_kernel_16x4_power8.S
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31
-#include "dgemm_macros_16x4_power8.S"
+#include "dtrmm_macros_16x4_power8.S"
#ifndef NEEDPARAM
diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S
new file mode 100644
index 000000000..079144a90
--- /dev/null
+++ b/kernel/power/dtrmm_macros_16x4_power8.S
@@ -0,0 +1,3431 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************
+* Macros for N=4, M=16 *
+*********************************************************************/
+
+.macro LOAD4x16_1	// preload first step: 16 A doubles into vs0-7, 4 broadcast B values into vs24-27; advances AO by 128, BO by 32
+
+	lxvd2x	vs0,	0,	AO
+	lxvd2x	vs1,	o16,	AO
+	lxvd2x	vs2,	o32,	AO
+	lxvd2x	vs3,	o48,	AO
+
+	lxvdsx	vs24,	0,	BO
+	lxvdsx	vs25,	o8,	BO
+
+	addi	AO,	AO,	64
+
+	lxvd2x	vs4,	0,	AO
+	lxvd2x	vs5,	o16,	AO
+	lxvd2x	vs6,	o32,	AO
+	lxvd2x	vs7,	o48,	AO
+
+	lxvdsx	vs26,	o16,	BO
+	lxvdsx	vs27,	o24,	BO
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x16_I1	// first k-step: initialize accumulators vs32-63 with xvmuldp while preloading the next A/B into the shadow registers vs8-15/vs28-31
+
+	xvmuldp		vs32,	vs0,	vs24
+	xvmuldp		vs33,	vs1,	vs24
+	xvmuldp		vs34,	vs2,	vs24
+	xvmuldp		vs35,	vs3,	vs24
+
+	lxvd2x	vs8,	0,	AO
+	lxvd2x	vs9,	o16,	AO
+
+	xvmuldp		vs36,	vs4,	vs24
+	xvmuldp		vs37,	vs5,	vs24
+	xvmuldp		vs38,	vs6,	vs24
+	xvmuldp		vs39,	vs7,	vs24
+
+	lxvdsx	vs28,	0,	BO
+	lxvdsx	vs29,	o8,	BO
+
+	xvmuldp		vs40,	vs0,	vs25
+	xvmuldp		vs41,	vs1,	vs25
+	xvmuldp		vs42,	vs2,	vs25
+	xvmuldp		vs43,	vs3,	vs25
+
+	lxvd2x	vs10,	o32,	AO
+	lxvd2x	vs11,	o48,	AO
+
+	xvmuldp		vs44,	vs4,	vs25
+	xvmuldp		vs45,	vs5,	vs25
+	xvmuldp		vs46,	vs6,	vs25
+	xvmuldp		vs47,	vs7,	vs25
+
+	addi	AO,	AO,	64
+
+	xvmuldp		vs48,	vs0,	vs26
+	xvmuldp		vs49,	vs1,	vs26
+	xvmuldp		vs50,	vs2,	vs26
+	xvmuldp		vs51,	vs3,	vs26
+
+	lxvd2x	vs12,	0,	AO
+	lxvd2x	vs13,	o16,	AO
+
+	xvmuldp		vs52,	vs4,	vs26
+	xvmuldp		vs53,	vs5,	vs26
+	xvmuldp		vs54,	vs6,	vs26
+	xvmuldp		vs55,	vs7,	vs26
+
+	lxvd2x	vs14,	o32,	AO
+	lxvd2x	vs15,	o48,	AO
+
+	xvmuldp		vs56,	vs0,	vs27
+	xvmuldp		vs57,	vs1,	vs27
+	xvmuldp		vs58,	vs2,	vs27
+	xvmuldp		vs59,	vs3,	vs27
+
+	lxvdsx	vs30,	o16,	BO
+	lxvdsx	vs31,	o24,	BO
+
+	xvmuldp		vs60,	vs4,	vs27
+	xvmuldp		vs61,	vs5,	vs27
+	xvmuldp		vs62,	vs6,	vs27
+	xvmuldp		vs63,	vs7,	vs27
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x16_1	// even pipelined k-step: FMA on vs0-7/vs24-27 while loading the next iteration into vs8-15/vs28-31
+
+	xvmaddadp	vs32,	vs0,	vs24
+	xvmaddadp	vs33,	vs1,	vs24
+	xvmaddadp	vs34,	vs2,	vs24
+	xvmaddadp	vs35,	vs3,	vs24
+
+	lxvd2x	vs8,	0,	AO
+	lxvd2x	vs9,	o16,	AO
+
+	xvmaddadp	vs36,	vs4,	vs24
+	xvmaddadp	vs37,	vs5,	vs24
+	xvmaddadp	vs38,	vs6,	vs24
+	xvmaddadp	vs39,	vs7,	vs24
+
+	lxvdsx	vs28,	0,	BO
+	lxvdsx	vs29,	o8,	BO
+
+	xvmaddadp	vs40,	vs0,	vs25
+	xvmaddadp	vs41,	vs1,	vs25
+	xvmaddadp	vs42,	vs2,	vs25
+	xvmaddadp	vs43,	vs3,	vs25
+
+	lxvd2x	vs10,	o32,	AO
+	lxvd2x	vs11,	o48,	AO
+
+	xvmaddadp	vs44,	vs4,	vs25
+	xvmaddadp	vs45,	vs5,	vs25
+	xvmaddadp	vs46,	vs6,	vs25
+	xvmaddadp	vs47,	vs7,	vs25
+
+	addi	AO,	AO,	64
+
+	xvmaddadp	vs48,	vs0,	vs26
+	xvmaddadp	vs49,	vs1,	vs26
+	xvmaddadp	vs50,	vs2,	vs26
+	xvmaddadp	vs51,	vs3,	vs26
+
+	lxvd2x	vs12,	0,	AO
+	lxvd2x	vs13,	o16,	AO
+
+	xvmaddadp	vs52,	vs4,	vs26
+	xvmaddadp	vs53,	vs5,	vs26
+	xvmaddadp	vs54,	vs6,	vs26
+	xvmaddadp	vs55,	vs7,	vs26
+
+	lxvd2x	vs14,	o32,	AO
+	lxvd2x	vs15,	o48,	AO
+
+	xvmaddadp	vs56,	vs0,	vs27
+	xvmaddadp	vs57,	vs1,	vs27
+	xvmaddadp	vs58,	vs2,	vs27
+	xvmaddadp	vs59,	vs3,	vs27
+
+
+	lxvdsx	vs30,	o16,	BO
+	lxvdsx	vs31,	o24,	BO
+
+	xvmaddadp	vs60,	vs4,	vs27
+	xvmaddadp	vs61,	vs5,	vs27
+	xvmaddadp	vs62,	vs6,	vs27
+	xvmaddadp	vs63,	vs7,	vs27
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x16_2	// odd pipelined k-step: FMA on vs8-15/vs28-31 while loading the next iteration back into vs0-7/vs24-27
+
+	xvmaddadp	vs32,	vs8,	vs28
+	xvmaddadp	vs33,	vs9,	vs28
+	xvmaddadp	vs34,	vs10,	vs28
+	xvmaddadp	vs35,	vs11,	vs28
+
+	lxvd2x	vs0,	0,	AO
+	lxvd2x	vs1,	o16,	AO
+
+	xvmaddadp	vs36,	vs12,	vs28
+	xvmaddadp	vs37,	vs13,	vs28
+	xvmaddadp	vs38,	vs14,	vs28
+	xvmaddadp	vs39,	vs15,	vs28
+
+	lxvdsx	vs24,	0,	BO
+	lxvdsx	vs25,	o8,	BO
+
+	xvmaddadp	vs40,	vs8,	vs29
+	xvmaddadp	vs41,	vs9,	vs29
+	xvmaddadp	vs42,	vs10,	vs29
+	xvmaddadp	vs43,	vs11,	vs29
+
+	lxvd2x	vs2,	o32,	AO
+	lxvd2x	vs3,	o48,	AO
+
+	xvmaddadp	vs44,	vs12,	vs29
+	xvmaddadp	vs45,	vs13,	vs29
+	xvmaddadp	vs46,	vs14,	vs29
+	xvmaddadp	vs47,	vs15,	vs29
+
+	addi	AO,	AO,	64
+
+	xvmaddadp	vs48,	vs8,	vs30
+	xvmaddadp	vs49,	vs9,	vs30
+	xvmaddadp	vs50,	vs10,	vs30
+	xvmaddadp	vs51,	vs11,	vs30
+
+	lxvd2x	vs4,	0,	AO
+	lxvd2x	vs5,	o16,	AO
+
+	xvmaddadp	vs52,	vs12,	vs30
+	xvmaddadp	vs53,	vs13,	vs30
+	xvmaddadp	vs54,	vs14,	vs30
+	xvmaddadp	vs55,	vs15,	vs30
+
+	lxvd2x	vs6,	o32,	AO
+	lxvd2x	vs7,	o48,	AO
+
+	xvmaddadp	vs56,	vs8,	vs31
+	xvmaddadp	vs57,	vs9,	vs31
+	xvmaddadp	vs58,	vs10,	vs31
+	xvmaddadp	vs59,	vs11,	vs31
+
+	lxvdsx	vs26,	o16,	BO
+	lxvdsx	vs27,	o24,	BO
+
+	xvmaddadp	vs60,	vs12,	vs31
+	xvmaddadp	vs61,	vs13,	vs31
+	xvmaddadp	vs62,	vs14,	vs31
+	xvmaddadp	vs63,	vs15,	vs31
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x16_E2	// final odd k-step: FMA on the shadow registers only, no further loads (drains the pipeline)
+
+
+	xvmaddadp	vs32,	vs8,	vs28
+	xvmaddadp	vs33,	vs9,	vs28
+	xvmaddadp	vs34,	vs10,	vs28
+	xvmaddadp	vs35,	vs11,	vs28
+	xvmaddadp	vs36,	vs12,	vs28
+	xvmaddadp	vs37,	vs13,	vs28
+	xvmaddadp	vs38,	vs14,	vs28
+	xvmaddadp	vs39,	vs15,	vs28
+
+	xvmaddadp	vs40,	vs8,	vs29
+	xvmaddadp	vs41,	vs9,	vs29
+	xvmaddadp	vs42,	vs10,	vs29
+	xvmaddadp	vs43,	vs11,	vs29
+	xvmaddadp	vs44,	vs12,	vs29
+	xvmaddadp	vs45,	vs13,	vs29
+	xvmaddadp	vs46,	vs14,	vs29
+	xvmaddadp	vs47,	vs15,	vs29
+
+	xvmaddadp	vs48,	vs8,	vs30
+	xvmaddadp	vs49,	vs9,	vs30
+	xvmaddadp	vs50,	vs10,	vs30
+	xvmaddadp	vs51,	vs11,	vs30
+	xvmaddadp	vs52,	vs12,	vs30
+	xvmaddadp	vs53,	vs13,	vs30
+	xvmaddadp	vs54,	vs14,	vs30
+	xvmaddadp	vs55,	vs15,	vs30
+
+	xvmaddadp	vs56,	vs8,	vs31
+	xvmaddadp	vs57,	vs9,	vs31
+	xvmaddadp	vs58,	vs10,	vs31
+	xvmaddadp	vs59,	vs11,	vs31
+	xvmaddadp	vs60,	vs12,	vs31
+	xvmaddadp	vs61,	vs13,	vs31
+	xvmaddadp	vs62,	vs14,	vs31
+	xvmaddadp	vs63,	vs15,	vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1	// standalone leftover k-iteration that initializes the accumulators (xvmuldp)
+
+	lxvd2x	vs0,	0,	AO
+	lxvd2x	vs1,	o16,	AO
+	lxvd2x	vs2,	o32,	AO
+	lxvd2x	vs3,	o48,	AO
+
+	lxvdsx	vs24,	0,	BO
+	lxvdsx	vs25,	o8,	BO
+	lxvdsx	vs26,	o16,	BO
+	lxvdsx	vs27,	o24,	BO
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+	lxvd2x	vs4,	0,	AO
+	lxvd2x	vs5,	o16,	AO
+	lxvd2x	vs6,	o32,	AO
+	lxvd2x	vs7,	o48,	AO
+
+	addi	AO,	AO,	64
+
+
+	xvmuldp		vs32,	vs0,	vs24
+	xvmuldp		vs33,	vs1,	vs24
+	xvmuldp		vs34,	vs2,	vs24
+	xvmuldp		vs35,	vs3,	vs24
+	xvmuldp		vs36,	vs4,	vs24
+	xvmuldp		vs37,	vs5,	vs24
+	xvmuldp		vs38,	vs6,	vs24
+	xvmuldp		vs39,	vs7,	vs24
+
+	xvmuldp		vs40,	vs0,	vs25
+	xvmuldp		vs41,	vs1,	vs25
+	xvmuldp		vs42,	vs2,	vs25
+	xvmuldp		vs43,	vs3,	vs25
+	xvmuldp		vs44,	vs4,	vs25
+	xvmuldp		vs45,	vs5,	vs25
+	xvmuldp		vs46,	vs6,	vs25
+	xvmuldp		vs47,	vs7,	vs25
+
+	xvmuldp		vs48,	vs0,	vs26
+	xvmuldp		vs49,	vs1,	vs26
+	xvmuldp		vs50,	vs2,	vs26
+	xvmuldp		vs51,	vs3,	vs26
+	xvmuldp		vs52,	vs4,	vs26
+	xvmuldp		vs53,	vs5,	vs26
+	xvmuldp		vs54,	vs6,	vs26
+	xvmuldp		vs55,	vs7,	vs26
+
+	xvmuldp		vs56,	vs0,	vs27
+	xvmuldp		vs57,	vs1,	vs27
+	xvmuldp		vs58,	vs2,	vs27
+	xvmuldp		vs59,	vs3,	vs27
+	xvmuldp		vs60,	vs4,	vs27
+	xvmuldp		vs61,	vs5,	vs27
+	xvmuldp		vs62,	vs6,	vs27
+	xvmuldp		vs63,	vs7,	vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1	// standalone leftover k-iteration that accumulates (xvmaddadp)
+
+	lxvd2x	vs0,	0,	AO
+	lxvd2x	vs1,	o16,	AO
+	lxvd2x	vs2,	o32,	AO
+	lxvd2x	vs3,	o48,	AO
+
+	lxvdsx	vs24,	0,	BO
+	lxvdsx	vs25,	o8,	BO
+	lxvdsx	vs26,	o16,	BO
+	lxvdsx	vs27,	o24,	BO
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+	lxvd2x	vs4,	0,	AO
+	lxvd2x	vs5,	o16,	AO
+	lxvd2x	vs6,	o32,	AO
+	lxvd2x	vs7,	o48,	AO
+
+	addi	AO,	AO,	64
+
+
+	xvmaddadp	vs32,	vs0,	vs24
+	xvmaddadp	vs33,	vs1,	vs24
+	xvmaddadp	vs34,	vs2,	vs24
+	xvmaddadp	vs35,	vs3,	vs24
+	xvmaddadp	vs36,	vs4,	vs24
+	xvmaddadp	vs37,	vs5,	vs24
+	xvmaddadp	vs38,	vs6,	vs24
+	xvmaddadp	vs39,	vs7,	vs24
+
+	xvmaddadp	vs40,	vs0,	vs25
+	xvmaddadp	vs41,	vs1,	vs25
+	xvmaddadp	vs42,	vs2,	vs25
+	xvmaddadp	vs43,	vs3,	vs25
+	xvmaddadp	vs44,	vs4,	vs25
+	xvmaddadp	vs45,	vs5,	vs25
+	xvmaddadp	vs46,	vs6,	vs25
+	xvmaddadp	vs47,	vs7,	vs25
+
+	xvmaddadp	vs48,	vs0,	vs26
+	xvmaddadp	vs49,	vs1,	vs26
+	xvmaddadp	vs50,	vs2,	vs26
+	xvmaddadp	vs51,	vs3,	vs26
+	xvmaddadp	vs52,	vs4,	vs26
+	xvmaddadp	vs53,	vs5,	vs26
+	xvmaddadp	vs54,	vs6,	vs26
+	xvmaddadp	vs55,	vs7,	vs26
+
+	xvmaddadp	vs56,	vs0,	vs27
+	xvmaddadp	vs57,	vs1,	vs27
+	xvmaddadp	vs58,	vs2,	vs27
+	xvmaddadp	vs59,	vs3,	vs27
+	xvmaddadp	vs60,	vs4,	vs27
+	xvmaddadp	vs61,	vs5,	vs27
+	xvmaddadp	vs62,	vs6,	vs27
+	xvmaddadp	vs63,	vs7,	vs27
+
+.endm
+
+.macro SAVE4x16	// write back the 4x16 tile: C = alpha*AB (+ old C unless TRMMKERNEL); one C row per LDC step; advances CO by 128 bytes
+
+	mr	T1,	CO
+	addi	T2,	T1,	64
+
+#ifndef TRMMKERNEL
+	lxvd2x	vs0,	0,	T1
+	lxvd2x	vs1,	o16,	T1
+	lxvd2x	vs2,	o32,	T1
+	lxvd2x	vs3,	o48,	T1
+
+	lxvd2x	vs4,	0,	T2
+	lxvd2x	vs5,	o16,	T2
+	lxvd2x	vs6,	o32,	T2
+	lxvd2x	vs7,	o48,	T2
+#endif
+
+#ifndef TRMMKERNEL
+	xvmaddadp	vs0,	vs32,	alpha_r
+	xvmaddadp	vs1,	vs33,	alpha_r
+	xvmaddadp	vs2,	vs34,	alpha_r
+	xvmaddadp	vs3,	vs35,	alpha_r
+	xvmaddadp	vs4,	vs36,	alpha_r
+	xvmaddadp	vs5,	vs37,	alpha_r
+	xvmaddadp	vs6,	vs38,	alpha_r
+	xvmaddadp	vs7,	vs39,	alpha_r
+#else
+	xvmuldp		vs0,	vs32,	alpha_r
+	xvmuldp		vs1,	vs33,	alpha_r
+	xvmuldp		vs2,	vs34,	alpha_r
+	xvmuldp		vs3,	vs35,	alpha_r
+	xvmuldp		vs4,	vs36,	alpha_r
+	xvmuldp		vs5,	vs37,	alpha_r
+	xvmuldp		vs6,	vs38,	alpha_r
+	xvmuldp		vs7,	vs39,	alpha_r
+#endif
+
+	stxvd2x		vs0,	0,	T1
+	stxvd2x		vs1,	o16,	T1
+	stxvd2x		vs2,	o32,	T1
+	stxvd2x		vs3,	o48,	T1
+
+
+	stxvd2x		vs4,	0,	T2
+	stxvd2x		vs5,	o16,	T2
+	stxvd2x		vs6,	o32,	T2
+	stxvd2x		vs7,	o48,	T2
+
+	add	T1,	T1,	LDC		// next row of C
+	add	T2,	T2,	LDC
+
+#ifndef TRMMKERNEL
+	lxvd2x	vs8,	0,	T1
+	lxvd2x	vs9,	o16,	T1
+	lxvd2x	vs10,	o32,	T1
+	lxvd2x	vs11,	o48,	T1
+
+	lxvd2x	vs12,	0,	T2
+	lxvd2x	vs13,	o16,	T2
+	lxvd2x	vs14,	o32,	T2
+	lxvd2x	vs15,	o48,	T2
+#endif
+
+#ifndef TRMMKERNEL
+	xvmaddadp	vs8,	vs40,	alpha_r
+	xvmaddadp	vs9,	vs41,	alpha_r
+	xvmaddadp	vs10,	vs42,	alpha_r
+	xvmaddadp	vs11,	vs43,	alpha_r
+	xvmaddadp	vs12,	vs44,	alpha_r
+	xvmaddadp	vs13,	vs45,	alpha_r
+	xvmaddadp	vs14,	vs46,	alpha_r
+	xvmaddadp	vs15,	vs47,	alpha_r
+#else
+	xvmuldp		vs8,	vs40,	alpha_r
+	xvmuldp		vs9,	vs41,	alpha_r
+	xvmuldp		vs10,	vs42,	alpha_r
+	xvmuldp		vs11,	vs43,	alpha_r
+	xvmuldp		vs12,	vs44,	alpha_r
+	xvmuldp		vs13,	vs45,	alpha_r
+	xvmuldp		vs14,	vs46,	alpha_r
+	xvmuldp		vs15,	vs47,	alpha_r
+#endif
+
+	stxvd2x		vs8,	0,	T1
+	stxvd2x		vs9,	o16,	T1
+	stxvd2x		vs10,	o32,	T1
+	stxvd2x		vs11,	o48,	T1
+
+
+	stxvd2x		vs12,	0,	T2
+	stxvd2x		vs13,	o16,	T2
+	stxvd2x		vs14,	o32,	T2
+	stxvd2x		vs15,	o48,	T2
+
+	add	T1,	T1,	LDC		// next row of C
+	add	T2,	T2,	LDC
+
+#ifndef TRMMKERNEL
+	lxvd2x	vs0,	0,	T1
+	lxvd2x	vs1,	o16,	T1
+	lxvd2x	vs2,	o32,	T1
+	lxvd2x	vs3,	o48,	T1
+
+	lxvd2x	vs4,	0,	T2
+	lxvd2x	vs5,	o16,	T2
+	lxvd2x	vs6,	o32,	T2
+	lxvd2x	vs7,	o48,	T2
+#endif
+
+#ifndef TRMMKERNEL
+	xvmaddadp	vs0,	vs48,	alpha_r
+	xvmaddadp	vs1,	vs49,	alpha_r
+	xvmaddadp	vs2,	vs50,	alpha_r
+	xvmaddadp	vs3,	vs51,	alpha_r
+	xvmaddadp	vs4,	vs52,	alpha_r
+	xvmaddadp	vs5,	vs53,	alpha_r
+	xvmaddadp	vs6,	vs54,	alpha_r
+	xvmaddadp	vs7,	vs55,	alpha_r
+#else
+	xvmuldp		vs0,	vs48,	alpha_r
+	xvmuldp		vs1,	vs49,	alpha_r
+	xvmuldp		vs2,	vs50,	alpha_r
+	xvmuldp		vs3,	vs51,	alpha_r
+	xvmuldp		vs4,	vs52,	alpha_r
+	xvmuldp		vs5,	vs53,	alpha_r
+	xvmuldp		vs6,	vs54,	alpha_r
+	xvmuldp		vs7,	vs55,	alpha_r
+#endif
+
+	stxvd2x		vs0,	0,	T1
+	stxvd2x		vs1,	o16,	T1
+	stxvd2x		vs2,	o32,	T1
+	stxvd2x		vs3,	o48,	T1
+
+
+	stxvd2x		vs4,	0,	T2
+	stxvd2x		vs5,	o16,	T2
+	stxvd2x		vs6,	o32,	T2
+	stxvd2x		vs7,	o48,	T2
+
+	add	T1,	T1,	LDC		// last row of C
+	add	T2,	T2,	LDC
+
+#ifndef TRMMKERNEL
+	lxvd2x	vs8,	0,	T1
+	lxvd2x	vs9,	o16,	T1
+	lxvd2x	vs10,	o32,	T1
+	lxvd2x	vs11,	o48,	T1
+
+	lxvd2x	vs12,	0,	T2
+	lxvd2x	vs13,	o16,	T2
+	lxvd2x	vs14,	o32,	T2
+	lxvd2x	vs15,	o48,	T2
+#endif
+
+#ifndef TRMMKERNEL
+	xvmaddadp	vs8,	vs56,	alpha_r
+	xvmaddadp	vs9,	vs57,	alpha_r
+	xvmaddadp	vs10,	vs58,	alpha_r
+	xvmaddadp	vs11,	vs59,	alpha_r
+	xvmaddadp	vs12,	vs60,	alpha_r
+	xvmaddadp	vs13,	vs61,	alpha_r
+	xvmaddadp	vs14,	vs62,	alpha_r
+	xvmaddadp	vs15,	vs63,	alpha_r
+#else
+	xvmuldp		vs8,	vs56,	alpha_r
+	xvmuldp		vs9,	vs57,	alpha_r
+	xvmuldp		vs10,	vs58,	alpha_r
+	xvmuldp		vs11,	vs59,	alpha_r
+	xvmuldp		vs12,	vs60,	alpha_r
+	xvmuldp		vs13,	vs61,	alpha_r
+	xvmuldp		vs14,	vs62,	alpha_r
+	xvmuldp		vs15,	vs63,	alpha_r
+#endif
+
+	stxvd2x		vs8,	0,	T1
+	stxvd2x		vs9,	o16,	T1
+	stxvd2x		vs10,	o32,	T1
+	stxvd2x		vs11,	o48,	T1
+
+
+	stxvd2x		vs12,	0,	T2
+	stxvd2x		vs13,	o16,	T2
+	stxvd2x		vs14,	o32,	T2
+	stxvd2x		vs15,	o48,	T2
+
+	addi	CO,	CO,	128		// advance to the next 16-column tile of C
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8 *
+*********************************************************************/
+
+.macro LOAD4x8_1	// preload first step for the M=8 tile: 8 A doubles into vs0-3, 4 broadcast B values into vs24-27; advances AO by 64, BO by 32
+
+	lxvd2x	vs0,	0,	AO
+	lxvd2x	vs1,	o16,	AO
+
+	lxvdsx	vs24,	0,	BO
+	lxvdsx	vs25,	o8,	BO
+
+	lxvd2x	vs2,	o32,	AO
+	lxvd2x	vs3,	o48,	AO
+
+	lxvdsx	vs26,	o16,	BO
+	lxvdsx	vs27,	o24,	BO
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x8_I1	// first k-step for M=8: initialize accumulators with xvmuldp while preloading shadow registers vs8-11/vs28-31
+
+	xvmuldp		vs32,	vs0,	vs24
+	xvmuldp		vs33,	vs1,	vs24
+	xvmuldp		vs34,	vs2,	vs24
+	xvmuldp		vs35,	vs3,	vs24
+
+	lxvd2x	vs8,	0,	AO
+	lxvd2x	vs9,	o16,	AO
+
+	xvmuldp		vs40,	vs0,	vs25
+	xvmuldp		vs41,	vs1,	vs25
+
+	lxvdsx	vs28,	0,	BO
+	lxvdsx	vs29,	o8,	BO
+
+	xvmuldp		vs42,	vs2,	vs25
+	xvmuldp		vs43,	vs3,	vs25
+
+	xvmuldp		vs48,	vs0,	vs26
+	xvmuldp		vs49,	vs1,	vs26
+
+	lxvd2x	vs10,	o32,	AO
+	lxvd2x	vs11,	o48,	AO
+
+	xvmuldp		vs50,	vs2,	vs26
+	xvmuldp		vs51,	vs3,	vs26
+
+	lxvdsx	vs30,	o16,	BO
+	lxvdsx	vs31,	o24,	BO
+
+	xvmuldp		vs56,	vs0,	vs27
+	xvmuldp		vs57,	vs1,	vs27
+	xvmuldp		vs58,	vs2,	vs27
+	xvmuldp		vs59,	vs3,	vs27
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x8_1	// even pipelined k-step for M=8: FMA on vs0-3/vs24-27 while loading vs8-11/vs28-31
+
+	xvmaddadp	vs32,	vs0,	vs24
+	xvmaddadp	vs33,	vs1,	vs24
+	xvmaddadp	vs34,	vs2,	vs24
+	xvmaddadp	vs35,	vs3,	vs24
+
+	lxvd2x	vs8,	0,	AO
+	lxvd2x	vs9,	o16,	AO
+
+	xvmaddadp	vs40,	vs0,	vs25
+	xvmaddadp	vs41,	vs1,	vs25
+	xvmaddadp	vs42,	vs2,	vs25
+	xvmaddadp	vs43,	vs3,	vs25
+
+	lxvdsx	vs28,	0,	BO
+	lxvdsx	vs29,	o8,	BO
+
+	xvmaddadp	vs48,	vs0,	vs26
+	xvmaddadp	vs49,	vs1,	vs26
+
+	lxvd2x	vs10,	o32,	AO
+	lxvd2x	vs11,	o48,	AO
+
+	xvmaddadp	vs50,	vs2,	vs26
+	xvmaddadp	vs51,	vs3,	vs26
+
+	lxvdsx	vs30,	o16,	BO
+	lxvdsx	vs31,	o24,	BO
+
+	xvmaddadp	vs56,	vs0,	vs27
+	xvmaddadp	vs57,	vs1,	vs27
+	xvmaddadp	vs58,	vs2,	vs27
+	xvmaddadp	vs59,	vs3,	vs27
+
+	addi	AO,	AO,	64
+	addi	BO,	BO,	32
+
+.endm
+
+.macro KERNEL4x8_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4 *
+*********************************************************************/
+
+.macro LOAD4x4_1
+ /* Preload one K-iteration: 4 doubles of A (vs0-vs1) and 4 splatted B values (vs24-vs27); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+ /* Pipeline prologue: prefetch next A/B into vs8-vs9/vs28-vs31, then initialize accumulators with plain multiplies from vs0-vs1/vs24-vs27. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_1
+ /* Even pipeline step: load next inputs into vs8-vs9/vs28-vs31, then accumulate the current vs0-vs1/vs24-vs27. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_2
+ /* Odd pipeline step: reload vs0-vs1/vs24-vs27 for the next even step, then accumulate the prefetched vs8-vs9/vs28-vs31. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro SAVE4x4
+ /* Write the 4x4 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 32 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2 *
+*********************************************************************/
+
+.macro LOAD4x2_1
+ /* Preload one K-iteration: 2 doubles of A (vs0) and 4 splatted B values (vs24-vs27); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+ /* Pipeline prologue: prefetch next A/B into vs8/vs28-vs31, then initialize accumulators with plain multiplies. */
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_1
+ /* Even pipeline step: load next inputs into vs8/vs28-vs31, accumulate current vs0/vs24-vs27. */
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_2
+ /* Odd pipeline step: reload vs0/vs24-vs27 for the next even step, accumulate the prefetched vs8/vs28-vs31. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x2
+ /* Write the 4x2 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 16 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1 *
+*********************************************************************/
+
+.macro LOAD4x1_1
+ /* Scalar variant (lxsdx/xs*): preload 1 double of A (vs0) and 4 B values (vs24-vs27); advance AO/BO. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+ /* Pipeline prologue: prefetch next A/B into vs8/vs28-vs31, then initialize accumulators with scalar multiplies (xsmuldp). */
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_1
+ /* Even pipeline step: load next inputs into vs8/vs28-vs31, accumulate current vs0/vs24-vs27 (xsmaddadp). */
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_2
+ /* Odd pipeline step: reload vs0/vs24-vs27 for the next even step, accumulate the prefetched vs8/vs28-vs31. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with scalar multiplies, advance pointers. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+ /* Non-pipelined K iteration: load, scalar multiply-accumulate, advance pointers. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x1
+ /* Write the 4x1 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 8 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs48, alpha_r
+#else
+ xsmuldp vs0, vs48, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs56, alpha_r
+#else
+ xsmuldp vs8, vs56, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16 *
+*********************************************************************/
+
+.macro LOAD2x16_1
+ /* Preload one K-iteration: 16 doubles of A (vs0-vs7, two 64-byte groups) and 2 splatted B values (vs24-vs25); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+ /* Pipeline prologue: prefetch next A/B into vs8-vs15/vs28-vs29, then initialize accumulators vs32-vs47 with plain multiplies. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_1
+ /* Even pipeline step: load next inputs into vs8-vs15/vs28-vs29, accumulate current vs0-vs7/vs24-vs25. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_2
+ /* Odd pipeline step: reload vs0-vs7/vs24-vs25 for the next even step, accumulate the prefetched vs8-vs15/vs28-vs29. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro SAVE2x16
+ /* Write the 2x16 tile: T1/T2 cover the two 64-byte halves of each row; per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 128 bytes. */
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8                                               *
+*********************************************************************/
+
+.macro LOAD2x8_1
+ /* Preload one K-iteration: 8 doubles of A (vs0-vs3) and 2 splatted B values (vs24-vs25); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+ /* Pipeline prologue: prefetch next A/B into vs8-vs11/vs28-vs29, then initialize accumulators with plain multiplies. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+ /* Even pipeline step: load next inputs into vs8-vs11/vs28-vs29, accumulate current vs0-vs3/vs24-vs25. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+ /* Odd pipeline step: reload vs0-vs3/vs24-vs25 for the next even step, accumulate the prefetched vs8-vs11/vs28-vs29. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+ /* Write the 2x8 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 64 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4 *
+*********************************************************************/
+
+.macro LOAD2x4_1
+ /* Preload one K-iteration: 4 doubles of A (vs0-vs1) and 2 splatted B values (vs24-vs25); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+ /* Pipeline prologue: prefetch next A/B into vs8-vs9/vs28-vs29, then initialize accumulators with plain multiplies. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_1
+ /* Even pipeline step: load next inputs into vs8-vs9/vs28-vs29, accumulate current vs0-vs1/vs24-vs25. */
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_2
+ /* Odd pipeline step: reload vs0-vs1/vs24-vs25 for the next even step, accumulate the prefetched vs8-vs9/vs28-vs29. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro SAVE2x4
+ /* Write the 2x4 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 32 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2 *
+*********************************************************************/
+
+.macro LOAD2x2_1
+ /* Preload one K-iteration: 2 doubles of A (vs0) and 2 splatted B values (vs24-vs25); advance AO/BO. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+ /* Pipeline prologue: prefetch next A/B into vs8/vs28-vs29, then initialize accumulators with plain multiplies. */
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_1
+ /* Even pipeline step: load next inputs into vs8/vs28-vs29, accumulate current vs0/vs24-vs25. */
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_2
+ /* Odd pipeline step: reload vs0/vs24-vs25 for the next even step, accumulate the prefetched vs8/vs28-vs29. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with plain multiplies, advance pointers. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+ /* Non-pipelined K iteration: load, multiply-accumulate, advance pointers. */
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x2
+ /* Write the 2x2 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 16 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1 *
+*********************************************************************/
+
+.macro LOAD2x1_1
+ /* Scalar variant (lxsdx/xs*): preload 1 double of A (vs0) and 2 B values (vs24-vs25); advance AO/BO. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+ /* Pipeline prologue: prefetch next A/B into vs8/vs28-vs29, then initialize accumulators with scalar multiplies. */
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_1
+ /* Even pipeline step: load next inputs into vs8/vs28-vs29, accumulate current vs0/vs24-vs25 (xsmaddadp). */
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_2
+ /* Odd pipeline step: reload vs0/vs24-vs25 for the next even step, accumulate the prefetched vs8/vs28-vs29. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+ /* Pipeline epilogue: accumulate the final prefetched operands; no loads, no pointer updates. */
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+ /* Non-pipelined first K iteration: load, initialize accumulators with scalar multiplies, advance pointers. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+ /* Non-pipelined K iteration: load, scalar multiply-accumulate, advance pointers. */
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x1
+ /* Write the 2x1 tile: per row (stride LDC), C += alpha_r*acc, or C = alpha_r*acc when TRMMKERNEL; advance CO by 8 bytes. */
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16 *
+*********************************************************************/
+
+.macro LOAD1x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8 *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2 *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1 *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
new file mode 100644
index 000000000..fdfc5ac70
--- /dev/null
+++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
@@ -0,0 +1,294 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define o0 0
+
+#define PRE r15
+#define T4 r16
+#define L r17
+#define T3 r18
+#define T2 r19
+#define KK r20
+#define I r21
+#define J r22
+#define AO r23
+#define BO r24
+#define CO r25
+#define o8 r26
+#define o16 r27
+#define o24 r28
+#define o32 r29
+#define o48 r30
+#define T1 r31
+
+#include "dtrsm_macros_LT_16x4_power8.S"
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+#endif
+
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+
+ cmpwi cr0, M, 0
+ ble L999
+ cmpwi cr0, N, 0
+ ble L999
+ cmpwi cr0, K, 0
+ ble L999
+
+ slwi LDC, LDC, BASE_SHIFT
+
+ li o8, 8
+ li o16, 16
+ li o24, 24
+ li o32, 32
+ li o48, 48
+ li PRE, 384
+
+ mr KK, OFFSET
+
+#include "dtrsm_logic_LT_16x4_power8.S"
+
+L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S
new file mode 100644
index 000000000..04f5fdd90
--- /dev/null
+++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S
@@ -0,0 +1,755 @@
+ srawi. J, N, 2
+ ble DSTRM_LT_L4_END
+
+
+DSTRM_LT_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+
+ mr KK, OFFSET
+ srawi. I, M, 4
+ ble DSTRM_LT_L4x16_END
+
+
+DSTRM_LT_L4x16_BEGIN:
+
+ mr BO, B
+
+ li L, -128
+
+ mr T1, CO
+ add T2, T1, LDC
+ add T3, T2, LDC
+ add T4, T3, LDC
+
+ and T1, T1, L
+ and T2, T2, L
+ and T3, T3, L
+ and T4, T4, L
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+ addi T1, T1, 128
+ addi T2, T2, 128
+ addi T3, T3, 128
+ addi T4, T4, 128
+
+ dcbt T1, r0
+ dcbt T2, r0
+ dcbt T3, r0
+ dcbt T4, r0
+
+
+DSTRM_LT_L4x16_LOOP_START:
+
+
+ INIT_16x4
+
+
+ addic. L, KK, 0
+ ble- DSTRM_LT_L4x16_SAVE
+ mtctr L
+
+DSTRM_LT_L4x16_LOOP:
+
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL_16x4
+ bdz- DSTRM_LT_L4x16_SAVE
+
+ dcbt AO, PRE
+ KERNEL_16x4
+ bdz- DSTRM_LT_L4x16_SAVE
+
+ dcbt AO, PRE
+ KERNEL_16x4
+ bdz- DSTRM_LT_L4x16_SAVE
+
+ dcbt AO, PRE
+ KERNEL_16x4
+ bdnz+ DSTRM_LT_L4x16_LOOP
+
+
+DSTRM_LT_L4x16_SAVE:
+
+ SOLVE_LT_16x4
+
+ addi CO, CO, 16*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 4+BASE_SHIFT
+ slwi T4, T4, 2+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 16
+
+ addic. I, I, -1
+ bgt DSTRM_LT_L4x16_BEGIN
+
+DSTRM_LT_L4x16_END:
+
+
+DSTRM_LT_L4x8_BEGIN:
+
+ andi. T2, M, 15
+ ble DSTRM_LT_L4x1_END
+
+ andi. T1, M, 8
+ ble DSTRM_LT_L4x8_END
+
+ mr BO, B
+
+
+DSTRM_LT_L4x8_LOOP_START:
+
+
+ INIT_8x4
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L4x8_SAVE
+
+DSTRM_LT_L4x8_LOOP:
+
+
+ KERNEL_8x4
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L4x8_LOOP
+
+
+DSTRM_LT_L4x8_SAVE:
+
+ SOLVE_LT_8x4
+
+ addi CO, CO, 8*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 3+BASE_SHIFT
+ slwi T4, T4, 2+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 8
+
+DSTRM_LT_L4x8_END:
+
+
+DSTRM_LT_L4x4_BEGIN:
+
+ andi. T1, M, 4
+ ble DSTRM_LT_L4x4_END
+
+ mr BO, B
+
+
+DSTRM_LT_L4x4_LOOP_START:
+
+
+ INIT_4x4
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L4x4_SAVE
+
+DSTRM_LT_L4x4_LOOP:
+
+
+ KERNEL_4x4
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L4x4_LOOP
+
+
+DSTRM_LT_L4x4_SAVE:
+
+ SOLVE_LT_4x4
+
+ addi CO, CO, 4*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 2+BASE_SHIFT
+ slwi T4, T4, 2+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 4
+
+DSTRM_LT_L4x4_END:
+
+
+DSTRM_LT_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble DSTRM_LT_L4x2_END
+
+ mr BO, B
+
+
+DSTRM_LT_L4x2_LOOP_START:
+
+
+ INIT_2x4
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L4x2_SAVE
+
+DSTRM_LT_L4x2_LOOP:
+
+
+ KERNEL_2x4
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L4x2_LOOP
+
+
+DSTRM_LT_L4x2_SAVE:
+
+ SOLVE_LT_2x4
+
+ addi CO, CO, 2*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 1+BASE_SHIFT
+ slwi T4, T4, 2+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 2
+
+DSTRM_LT_L4x2_END:
+
+
+DSTRM_LT_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble DSTRM_LT_L4x1_END
+
+ mr BO, B
+
+
+DSTRM_LT_L4x1_LOOP_START:
+
+
+ INIT_1x4
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L4x1_SAVE
+
+DSTRM_LT_L4x1_LOOP:
+
+
+ KERNEL_1x4
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L4x1_LOOP
+
+
+DSTRM_LT_L4x1_SAVE:
+
+ SOLVE_LT_1x4
+
+ addi CO, CO, 1*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 0+BASE_SHIFT
+ slwi T4, T4, 2+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 1
+
+DSTRM_LT_L4x1_END:
+
+ slwi T1, K, 2+BASE_SHIFT
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt DSTRM_LT_L4_BEGIN
+
+ andi. T2, N, 3
+ ble L999
+
+DSTRM_LT_L4_END:
+
+ b DSTRM_LT_L2_BEGIN
+
+L999_H1:
+
+ b L999
+
+
+DSTRM_LT_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble DSTRM_LT_L2_END
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+ mr KK, OFFSET
+ srawi. I, M, 4
+ ble DSTRM_LT_L2x16_END
+
+
+DSTRM_LT_L2x16_BEGIN:
+
+ mr BO, B
+
+
+DSTRM_LT_L2x16_LOOP_START:
+
+
+ INIT_16x2
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L2x16_SAVE
+
+DSTRM_LT_L2x16_LOOP:
+
+
+ KERNEL_16x2
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L2x16_LOOP
+
+
+DSTRM_LT_L2x16_SAVE:
+
+ SOLVE_LT_16x2
+
+ addi CO, CO, 16*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 4+BASE_SHIFT
+ slwi T4, T4, 1+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 16
+
+ addic. I, I, -1
+ bgt DSTRM_LT_L2x16_BEGIN
+
+DSTRM_LT_L2x16_END:
+
+
+DSTRM_LT_L2x8_BEGIN:
+
+ andi. T2, M, 15
+ ble DSTRM_LT_L2x1_END
+
+ andi. T1, M, 8
+ ble DSTRM_LT_L2x8_END
+
+ mr BO, B
+
+
+DSTRM_LT_L2x8_LOOP_START:
+
+
+ INIT_8x2
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L2x8_SAVE
+
+DSTRM_LT_L2x8_LOOP:
+
+
+ KERNEL_8x2
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L2x8_LOOP
+
+
+DSTRM_LT_L2x8_SAVE:
+
+ SOLVE_LT_8x2
+
+ addi CO, CO, 8*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 3+BASE_SHIFT
+ slwi T4, T4, 1+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 8
+
+DSTRM_LT_L2x8_END:
+
+
+DSTRM_LT_L2x4_BEGIN:
+
+ andi. T1, M, 4
+ ble DSTRM_LT_L2x4_END
+
+ mr BO, B
+
+
+DSTRM_LT_L2x4_LOOP_START:
+
+
+ INIT_4x2
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L2x4_SAVE
+
+DSTRM_LT_L2x4_LOOP:
+
+
+ KERNEL_4x2
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L2x4_LOOP
+
+
+DSTRM_LT_L2x4_SAVE:
+
+ SOLVE_LT_4x2
+
+ addi CO, CO, 4*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 2+BASE_SHIFT
+ slwi T4, T4, 1+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 4
+
+DSTRM_LT_L2x4_END:
+
+
+DSTRM_LT_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble DSTRM_LT_L2x2_END
+
+ mr BO, B
+
+
+DSTRM_LT_L2x2_LOOP_START:
+
+
+ INIT_2x2
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L2x2_SAVE
+
+DSTRM_LT_L2x2_LOOP:
+
+
+ KERNEL_2x2
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L2x2_LOOP
+
+
+DSTRM_LT_L2x2_SAVE:
+
+ SOLVE_LT_2x2
+
+ addi CO, CO, 2*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 1+BASE_SHIFT
+ slwi T4, T4, 1+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 2
+
+DSTRM_LT_L2x2_END:
+
+
+DSTRM_LT_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble DSTRM_LT_L2x1_END
+
+ mr BO, B
+
+
+DSTRM_LT_L2x1_LOOP_START:
+
+
+ INIT_1x2
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L2x1_SAVE
+
+DSTRM_LT_L2x1_LOOP:
+
+
+ KERNEL_1x2
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L2x1_LOOP
+
+
+DSTRM_LT_L2x1_SAVE:
+
+ SOLVE_LT_1x2
+
+ addi CO, CO, 1*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 0+BASE_SHIFT
+ slwi T4, T4, 1+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 1
+
+DSTRM_LT_L2x1_END:
+
+ slwi T1, K, 1+BASE_SHIFT
+ add B, B, T1
+
+DSTRM_LT_L2_END:
+
+DSTRM_LT_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble DSTRM_LT_L1_END
+
+ mr CO, C
+ mr AO, A
+
+ mr KK, OFFSET
+ srawi. I, M, 4
+ ble DSTRM_LT_L1x16_END
+
+
+DSTRM_LT_L1x16_BEGIN:
+
+ mr BO, B
+
+
+DSTRM_LT_L1x16_LOOP_START:
+
+
+ INIT_16x1
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L1x16_SAVE
+
+DSTRM_LT_L1x16_LOOP:
+
+
+ KERNEL_16x1
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L1x16_LOOP
+
+
+DSTRM_LT_L1x16_SAVE:
+
+ SOLVE_LT_16x1
+
+ addi CO, CO, 16*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 4+BASE_SHIFT
+ slwi T4, T4, 0+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 16
+
+ addic. I, I, -1
+ bgt DSTRM_LT_L1x16_BEGIN
+
+DSTRM_LT_L1x16_END:
+
+
+DSTRM_LT_L1x8_BEGIN:
+
+ andi. T1, M, 8
+ ble DSTRM_LT_L1x8_END
+
+ mr BO, B
+
+
+DSTRM_LT_L1x8_LOOP_START:
+
+
+ INIT_8x1
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L1x8_SAVE
+
+DSTRM_LT_L1x8_LOOP:
+
+
+ KERNEL_8x1
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L1x8_LOOP
+
+
+DSTRM_LT_L1x8_SAVE:
+
+ SOLVE_LT_8x1
+
+ addi CO, CO, 8*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 3+BASE_SHIFT
+ slwi T4, T4, 0+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 8
+
+DSTRM_LT_L1x8_END:
+
+
+DSTRM_LT_L1x4_BEGIN:
+
+ andi. T1, M, 4
+ ble DSTRM_LT_L1x4_END
+
+ mr BO, B
+
+
+DSTRM_LT_L1x4_LOOP_START:
+
+
+ INIT_4x1
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L1x4_SAVE
+
+DSTRM_LT_L1x4_LOOP:
+
+
+ KERNEL_4x1
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L1x4_LOOP
+
+
+DSTRM_LT_L1x4_SAVE:
+
+ SOLVE_LT_4x1
+
+ addi CO, CO, 4*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 2+BASE_SHIFT
+ slwi T4, T4, 0+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 4
+
+DSTRM_LT_L1x4_END:
+
+
+DSTRM_LT_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble DSTRM_LT_L1x2_END
+
+ mr BO, B
+
+
+DSTRM_LT_L1x2_LOOP_START:
+
+
+ INIT_2x1
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L1x2_SAVE
+
+DSTRM_LT_L1x2_LOOP:
+
+
+ KERNEL_2x1
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L1x2_LOOP
+
+
+DSTRM_LT_L1x2_SAVE:
+
+ SOLVE_LT_2x1
+
+ addi CO, CO, 2*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 1+BASE_SHIFT
+ slwi T4, T4, 0+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 2
+
+DSTRM_LT_L1x2_END:
+
+
+DSTRM_LT_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble DSTRM_LT_L1x1_END
+
+ mr BO, B
+
+
+DSTRM_LT_L1x1_LOOP_START:
+
+
+ INIT_1x1
+
+
+ addic. L, KK, 0
+ ble DSTRM_LT_L1x1_SAVE
+
+DSTRM_LT_L1x1_LOOP:
+
+
+ KERNEL_1x1
+
+ addic. L, L, -1
+ bgt DSTRM_LT_L1x1_LOOP
+
+
+DSTRM_LT_L1x1_SAVE:
+
+ SOLVE_LT_1x1
+
+ addi CO, CO, 1*SIZE
+
+ sub T3, K, KK
+ sub T4, K, KK
+ slwi T3, T3, 0+BASE_SHIFT
+ slwi T4, T4, 0+BASE_SHIFT
+ add AO, AO, T3
+ add BO, BO, T4
+ addi KK, KK, 1
+
+DSTRM_LT_L1x1_END:
+
+DSTRM_LT_L1_END:
diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S
new file mode 100644
index 000000000..dc47daa3a
--- /dev/null
+++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S
@@ -0,0 +1,4659 @@
+
+.macro INIT_16x4
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+ xvmovdp vs40, vs0
+ xvmovdp vs41, vs0
+ xvmovdp vs42, vs0
+ xvmovdp vs43, vs0
+ xvmovdp vs44, vs0
+ xvmovdp vs45, vs0
+ xvmovdp vs46, vs0
+ xvmovdp vs47, vs0
+ xvmovdp vs48, vs0
+ xvmovdp vs49, vs0
+ xvmovdp vs50, vs0
+ xvmovdp vs51, vs0
+ xvmovdp vs52, vs0
+ xvmovdp vs53, vs0
+ xvmovdp vs54, vs0
+ xvmovdp vs55, vs0
+ xvmovdp vs56, vs0
+ xvmovdp vs57, vs0
+ xvmovdp vs58, vs0
+ xvmovdp vs59, vs0
+ xvmovdp vs60, vs0
+ xvmovdp vs61, vs0
+ xvmovdp vs62, vs0
+ xvmovdp vs63, vs0
+
+.endm
+
+
+.macro KERNEL_16x4
+
+
+ lxvd2x vs0, o0, AO
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+ lxvdsx vs18, o16, BO
+ lxvdsx vs19, o24, BO
+
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ addi BO, BO, 32
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
+ xvmaddadp vs36, vs1, vs16
+ xvmaddadp vs37, vs1, vs17
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs39, vs1, vs19
+ xvmaddadp vs40, vs2, vs16
+ xvmaddadp vs41, vs2, vs17
+ xvmaddadp vs42, vs2, vs18
+ xvmaddadp vs43, vs2, vs19
+ xvmaddadp vs44, vs3, vs16
+ xvmaddadp vs45, vs3, vs17
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs47, vs3, vs19
+ xvmaddadp vs48, vs4, vs16
+ xvmaddadp vs49, vs4, vs17
+ xvmaddadp vs50, vs4, vs18
+ xvmaddadp vs51, vs4, vs19
+ xvmaddadp vs52, vs5, vs16
+ xvmaddadp vs53, vs5, vs17
+ xvmaddadp vs54, vs5, vs18
+ xvmaddadp vs55, vs5, vs19
+ xvmaddadp vs56, vs6, vs16
+ xvmaddadp vs57, vs6, vs17
+ xvmaddadp vs58, vs6, vs18
+ xvmaddadp vs59, vs6, vs19
+ xvmaddadp vs60, vs7, vs16
+ xvmaddadp vs61, vs7, vs17
+ xvmaddadp vs62, vs7, vs18
+ xvmaddadp vs63, vs7, vs19
+
+
+.endm
+
+
+.macro INIT_8x4
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+ xvmovdp vs40, vs0
+ xvmovdp vs41, vs0
+ xvmovdp vs42, vs0
+ xvmovdp vs43, vs0
+ xvmovdp vs44, vs0
+ xvmovdp vs45, vs0
+ xvmovdp vs46, vs0
+ xvmovdp vs47, vs0
+
+.endm
+
+
+.macro KERNEL_8x4
+
+
+ lxvd2x vs0, o0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+ lxvdsx vs18, o16, BO
+ lxvdsx vs19, o24, BO
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
+ xvmaddadp vs36, vs1, vs16
+ xvmaddadp vs37, vs1, vs17
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs39, vs1, vs19
+ xvmaddadp vs40, vs2, vs16
+ xvmaddadp vs41, vs2, vs17
+ xvmaddadp vs42, vs2, vs18
+ xvmaddadp vs43, vs2, vs19
+ xvmaddadp vs44, vs3, vs16
+ xvmaddadp vs45, vs3, vs17
+ xvmaddadp vs46, vs3, vs18
+ xvmaddadp vs47, vs3, vs19
+
+
+.endm
+
+
+.macro INIT_4x4
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+
+.endm
+
+
+.macro KERNEL_4x4
+
+
+ lxvd2x vs0, o0, AO
+ lxvd2x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+ lxvdsx vs18, o16, BO
+ lxvdsx vs19, o24, BO
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
+ xvmaddadp vs36, vs1, vs16
+ xvmaddadp vs37, vs1, vs17
+ xvmaddadp vs38, vs1, vs18
+ xvmaddadp vs39, vs1, vs19
+
+
+.endm
+
+
+.macro INIT_2x4
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+
+.endm
+
+
+.macro KERNEL_2x4
+
+
+ lxvd2x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+ lxvdsx vs18, o16, BO
+ lxvdsx vs19, o24, BO
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
+
+
+.endm
+
+
+.macro INIT_1x4
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+
+.endm
+
+
+.macro KERNEL_1x4
+
+
+ lxvdsx vs0, o0, AO
+
+ addi AO, AO, 8
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+ lxvdsx vs18, o16, BO
+ lxvdsx vs19, o24, BO
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs0, vs18
+ xvmaddadp vs35, vs0, vs19
+
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 16x4
+##########################################################################################*/
+
+.macro SOLVE_LT_16x4
+
+// Forward substitution for a 16x4 tile: B := inv(L^T) * B where the packed
+// A tile holds the transposed lower-triangular factor, one 16*SIZE-stride
+// row per offset (row k: k skipped entries, then the inverse diagonal
+// followed by 15-k off-diagonal multipliers).
+//
+// This version is heavily software-pipelined: the multipliers consumed in
+// each "OFFSET k" section were loaded from A during the PREVIOUS section,
+// and finished result rows are streamed back to B (via T4) as soon as they
+// are final. Statement order is therefore load-bearing; do not reorder.
+
+//############### LOAD B #######################
+
+// Repack the accumulators (row pairs vs32..vs63) into vs0-vs31 with
+// xxpermdi (0 = both high doublewords, 3 = both low), while loading the
+// 16x4 B tile into vs32-vs63; the xvsubdp block below forms B - A*X.
+
+	mr		T1,	BO
+	mr		T4,	BO		// T4: store cursor for writing solved rows back to B
+
+	xxpermdi	vs0,	vs32,	vs33,	0
+	xxpermdi	vs1,	vs34,	vs35,	0
+	xxpermdi	vs2,	vs32,	vs33,	3
+	xxpermdi	vs3,	vs34,	vs35,	3
+
+	lxvd2x		vs32,	o0,	T1
+	lxvd2x		vs33,	o16,	T1
+	lxvd2x		vs34,	o32,	T1
+	lxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs4,	vs36,	vs37,	0
+	xxpermdi	vs5,	vs38,	vs39,	0
+	xxpermdi	vs6,	vs36,	vs37,	3
+	xxpermdi	vs7,	vs38,	vs39,	3
+
+	lxvd2x		vs36,	o0,	T1
+	lxvd2x		vs37,	o16,	T1
+	lxvd2x		vs38,	o32,	T1
+	lxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs8,	vs40,	vs41,	0
+	xxpermdi	vs9,	vs42,	vs43,	0
+	xxpermdi	vs10,	vs40,	vs41,	3
+	xxpermdi	vs11,	vs42,	vs43,	3
+
+	lxvd2x		vs40,	o0,	T1
+	lxvd2x		vs41,	o16,	T1
+	lxvd2x		vs42,	o32,	T1
+	lxvd2x		vs43,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs12,	vs44,	vs45,	0
+	xxpermdi	vs13,	vs46,	vs47,	0
+	xxpermdi	vs14,	vs44,	vs45,	3
+	xxpermdi	vs15,	vs46,	vs47,	3
+
+	lxvd2x		vs44,	o0,	T1
+	lxvd2x		vs45,	o16,	T1
+	lxvd2x		vs46,	o32,	T1
+	lxvd2x		vs47,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs16,	vs48,	vs49,	0
+	xxpermdi	vs17,	vs50,	vs51,	0
+	xxpermdi	vs18,	vs48,	vs49,	3
+	xxpermdi	vs19,	vs50,	vs51,	3
+
+	lxvd2x		vs48,	o0,	T1
+	lxvd2x		vs49,	o16,	T1
+	lxvd2x		vs50,	o32,	T1
+	lxvd2x		vs51,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs20,	vs52,	vs53,	0
+	xxpermdi	vs21,	vs54,	vs55,	0
+	xxpermdi	vs22,	vs52,	vs53,	3
+	xxpermdi	vs23,	vs54,	vs55,	3
+
+	lxvd2x		vs52,	o0,	T1
+	lxvd2x		vs53,	o16,	T1
+	lxvd2x		vs54,	o32,	T1
+	lxvd2x		vs55,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs24,	vs56,	vs57,	0
+	xxpermdi	vs25,	vs58,	vs59,	0
+	xxpermdi	vs26,	vs56,	vs57,	3
+	xxpermdi	vs27,	vs58,	vs59,	3
+
+	lxvd2x		vs56,	o0,	T1
+	lxvd2x		vs57,	o16,	T1
+	lxvd2x		vs58,	o32,	T1
+	lxvd2x		vs59,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	xxpermdi	vs28,	vs60,	vs61,	0
+	xxpermdi	vs29,	vs62,	vs63,	0
+	xxpermdi	vs30,	vs60,	vs61,	3
+	xxpermdi	vs31,	vs62,	vs63,	3
+
+
+
+	lxvd2x		vs60,	o0,	T1
+	lxvd2x		vs61,	o16,	T1
+	lxvd2x		vs62,	o32,	T1
+	lxvd2x		vs63,	o48,	T1
+
+//############### OFFSET 0 #######################
+
+// Subtract the accumulated products (B - A*X) and prefetch/load row 0 of
+// the triangular factor (16 values: inverse diagonal + 15 multipliers)
+// into vs0-vs15 for use in the OFFSET 1 section below.
+
+	dcbt		AO,	PRE
+	mr		T1, AO
+
+	xvsubdp		vs32,	vs32,	vs0
+	xvsubdp		vs33,	vs33,	vs1
+	xvsubdp		vs34,	vs34,	vs2
+	xvsubdp		vs35,	vs35,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvsubdp		vs36,	vs36,	vs4
+	xvsubdp		vs37,	vs37,	vs5
+	xvsubdp		vs38,	vs38,	vs6
+	xvsubdp		vs39,	vs39,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvsubdp		vs40,	vs40,	vs8
+	xvsubdp		vs41,	vs41,	vs9
+	xvsubdp		vs42,	vs42,	vs10
+	xvsubdp		vs43,	vs43,	vs11
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+	lxvdsx		vs11,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvsubdp		vs44,	vs44,	vs12
+	xvsubdp		vs45,	vs45,	vs13
+	xvsubdp		vs46,	vs46,	vs14
+	xvsubdp		vs47,	vs47,	vs15
+
+	lxvdsx		vs12,	o0,	T1
+	lxvdsx		vs13,	o8,	T1
+	lxvdsx		vs14,	o16,	T1
+	lxvdsx		vs15,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvsubdp		vs48,	vs48,	vs16
+	xvsubdp		vs49,	vs49,	vs17
+	xvsubdp		vs50,	vs50,	vs18
+	xvsubdp		vs51,	vs51,	vs19
+
+	xvsubdp		vs52,	vs52,	vs20
+	xvsubdp		vs53,	vs53,	vs21
+	xvsubdp		vs54,	vs54,	vs22
+	xvsubdp		vs55,	vs55,	vs23
+
+	xvsubdp		vs56,	vs56,	vs24
+	xvsubdp		vs57,	vs57,	vs25
+	xvsubdp		vs58,	vs58,	vs26
+	xvsubdp		vs59,	vs59,	vs27
+
+	xvsubdp		vs60,	vs60,	vs28
+	xvsubdp		vs61,	vs61,	vs29
+	xvsubdp		vs62,	vs62,	vs30
+	xvsubdp		vs63,	vs63,	vs31
+
+//############### OFFSET 1 #######################
+
+// Solve row 0 (vs32/vs33 *= inverse diagonal vs0) and eliminate it from
+// rows 1..15 with xvnmsubadp using vs1-vs15 (loaded above). Meanwhile
+// load row 1's 15 factor values for the next section. Each subsequent
+// OFFSET block repeats this pattern with one fewer multiplier; the
+// leading addi T1 skips the k packed zero slots at the start of row k.
+
+	addi		T1,	T1,	1*SIZE
+
+	xvmuldp		vs32,	vs32,	vs0
+	xvmuldp		vs33,	vs33,	vs0
+
+	xvnmsubadp	vs34,	vs32,	vs1
+	xvnmsubadp	vs35,	vs33,	vs1
+	xvnmsubadp	vs36,	vs32,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs37,	vs33,	vs2
+	xvnmsubadp	vs38,	vs32,	vs3
+	xvnmsubadp	vs39,	vs33,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs40,	vs32,	vs4
+	xvnmsubadp	vs41,	vs33,	vs4
+	xvnmsubadp	vs42,	vs32,	vs5
+	xvnmsubadp	vs43,	vs33,	vs5
+	xvnmsubadp	vs44,	vs32,	vs6
+	xvnmsubadp	vs45,	vs33,	vs6
+	xvnmsubadp	vs46,	vs32,	vs7
+	xvnmsubadp	vs47,	vs33,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs48,	vs32,	vs8
+	xvnmsubadp	vs49,	vs33,	vs8
+	xvnmsubadp	vs50,	vs32,	vs9
+	xvnmsubadp	vs51,	vs33,	vs9
+	xvnmsubadp	vs52,	vs32,	vs10
+	xvnmsubadp	vs53,	vs33,	vs10
+	xvnmsubadp	vs54,	vs32,	vs11
+	xvnmsubadp	vs55,	vs33,	vs11
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+	lxvdsx		vs11,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs56,	vs32,	vs12
+	xvnmsubadp	vs57,	vs33,	vs12
+	xvnmsubadp	vs58,	vs32,	vs13
+	xvnmsubadp	vs59,	vs33,	vs13
+	xvnmsubadp	vs60,	vs32,	vs14
+	xvnmsubadp	vs61,	vs33,	vs14
+	xvnmsubadp	vs62,	vs32,	vs15
+	xvnmsubadp	vs63,	vs33,	vs15
+
+
+	lxvdsx		vs12,	o0,	T1
+	lxvdsx		vs13,	o8,	T1
+	lxvdsx		vs14,	o16,	T1
+
+	addi		T1,	T1,	24
+
+//############### OFFSET 2 #######################
+
+	xvmuldp		vs34,	vs34,	vs0
+	xvmuldp		vs35,	vs35,	vs0
+
+	addi		T1,	T1,	2*SIZE
+
+	xvnmsubadp	vs36,	vs34,	vs1
+	xvnmsubadp	vs37,	vs35,	vs1
+	xvnmsubadp	vs38,	vs34,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs39,	vs35,	vs2
+	xvnmsubadp	vs40,	vs34,	vs3
+	xvnmsubadp	vs41,	vs35,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs42,	vs34,	vs4
+	xvnmsubadp	vs43,	vs35,	vs4
+	xvnmsubadp	vs44,	vs34,	vs5
+	xvnmsubadp	vs45,	vs35,	vs5
+	xvnmsubadp	vs46,	vs34,	vs6
+	xvnmsubadp	vs47,	vs35,	vs6
+	xvnmsubadp	vs48,	vs34,	vs7
+	xvnmsubadp	vs49,	vs35,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs50,	vs34,	vs8
+	xvnmsubadp	vs51,	vs35,	vs8
+	xvnmsubadp	vs52,	vs34,	vs9
+	xvnmsubadp	vs53,	vs35,	vs9
+	xvnmsubadp	vs54,	vs34,	vs10
+	xvnmsubadp	vs55,	vs35,	vs10
+	xvnmsubadp	vs56,	vs34,	vs11
+	xvnmsubadp	vs57,	vs35,	vs11
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+	lxvdsx		vs11,	o24,	T1
+
+	addi		T1,	T1,	32
+
+
+	xvnmsubadp	vs58,	vs34,	vs12
+	xvnmsubadp	vs59,	vs35,	vs12
+	xvnmsubadp	vs60,	vs34,	vs13
+	xvnmsubadp	vs61,	vs35,	vs13
+	xvnmsubadp	vs62,	vs34,	vs14
+	xvnmsubadp	vs63,	vs35,	vs14
+
+	lxvdsx		vs12,	o0,	T1
+	lxvdsx		vs13,	o8,	T1
+
+	addi		T1,	T1,	16
+
+//############### OFFSET 3 #######################
+	xvmuldp		vs36,	vs36,	vs0
+	xvmuldp		vs37,	vs37,	vs0
+
+	addi		T1,	T1,	3*SIZE
+
+	xvnmsubadp	vs38,	vs36,	vs1
+	xvnmsubadp	vs39,	vs37,	vs1
+	xvnmsubadp	vs40,	vs36,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs41,	vs37,	vs2
+	xvnmsubadp	vs42,	vs36,	vs3
+	xvnmsubadp	vs43,	vs37,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs44,	vs36,	vs4
+	xvnmsubadp	vs45,	vs37,	vs4
+	xvnmsubadp	vs46,	vs36,	vs5
+	xvnmsubadp	vs47,	vs37,	vs5
+	xvnmsubadp	vs48,	vs36,	vs6
+	xvnmsubadp	vs49,	vs37,	vs6
+	xvnmsubadp	vs50,	vs36,	vs7
+	xvnmsubadp	vs51,	vs37,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs52,	vs36,	vs8
+	xvnmsubadp	vs53,	vs37,	vs8
+	xvnmsubadp	vs54,	vs36,	vs9
+	xvnmsubadp	vs55,	vs37,	vs9
+	xvnmsubadp	vs56,	vs36,	vs10
+	xvnmsubadp	vs57,	vs37,	vs10
+	xvnmsubadp	vs58,	vs36,	vs11
+	xvnmsubadp	vs59,	vs37,	vs11
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+	lxvdsx		vs11,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs60,	vs36,	vs12
+	xvnmsubadp	vs61,	vs37,	vs12
+	xvnmsubadp	vs62,	vs36,	vs13
+	xvnmsubadp	vs63,	vs37,	vs13
+
+	lxvdsx		vs12,	o0,	T1
+
+// Rows vs32-vs35 are final; stream them back to B while the solve continues.
+	stxvd2x		vs32,	o0,	T4
+	stxvd2x		vs33,	o16,	T4
+	stxvd2x		vs34,	o32,	T4
+	stxvd2x		vs35,	o48,	T4
+
+	addi		T4,	T4,	64
+
+	addi		T1,	T1,	8
+
+//############### OFFSET 4 #######################
+	xvmuldp		vs38,	vs38,	vs0
+	xvmuldp		vs39,	vs39,	vs0
+
+	addi		T1,	T1,	4*SIZE
+
+	xvnmsubadp	vs40,	vs38,	vs1
+	xvnmsubadp	vs41,	vs39,	vs1
+	xvnmsubadp	vs42,	vs38,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs43,	vs39,	vs2
+	xvnmsubadp	vs44,	vs38,	vs3
+	xvnmsubadp	vs45,	vs39,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs46,	vs38,	vs4
+	xvnmsubadp	vs47,	vs39,	vs4
+	xvnmsubadp	vs48,	vs38,	vs5
+	xvnmsubadp	vs49,	vs39,	vs5
+	xvnmsubadp	vs50,	vs38,	vs6
+	xvnmsubadp	vs51,	vs39,	vs6
+	xvnmsubadp	vs52,	vs38,	vs7
+	xvnmsubadp	vs53,	vs39,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+
+	xvnmsubadp	vs54,	vs38,	vs8
+	xvnmsubadp	vs55,	vs39,	vs8
+	xvnmsubadp	vs56,	vs38,	vs9
+	xvnmsubadp	vs57,	vs39,	vs9
+	xvnmsubadp	vs58,	vs38,	vs10
+	xvnmsubadp	vs59,	vs39,	vs10
+	xvnmsubadp	vs60,	vs38,	vs11
+	xvnmsubadp	vs61,	vs39,	vs11
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+	lxvdsx		vs11,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs62,	vs38,	vs12
+	xvnmsubadp	vs63,	vs39,	vs12
+
+
+//############### OFFSET 5 #######################
+	xvmuldp		vs40,	vs40,	vs0
+	xvmuldp		vs41,	vs41,	vs0
+
+	addi		T1,	T1,	5*SIZE
+
+	xvnmsubadp	vs42,	vs40,	vs1
+	xvnmsubadp	vs43,	vs41,	vs1
+	xvnmsubadp	vs44,	vs40,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs45,	vs41,	vs2
+	xvnmsubadp	vs46,	vs40,	vs3
+	xvnmsubadp	vs47,	vs41,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs48,	vs40,	vs4
+	xvnmsubadp	vs49,	vs41,	vs4
+	xvnmsubadp	vs50,	vs40,	vs5
+	xvnmsubadp	vs51,	vs41,	vs5
+	xvnmsubadp	vs52,	vs40,	vs6
+	xvnmsubadp	vs53,	vs41,	vs6
+	xvnmsubadp	vs54,	vs40,	vs7
+	xvnmsubadp	vs55,	vs41,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs56,	vs40,	vs8
+	xvnmsubadp	vs57,	vs41,	vs8
+	xvnmsubadp	vs58,	vs40,	vs9
+	xvnmsubadp	vs59,	vs41,	vs9
+	xvnmsubadp	vs60,	vs40,	vs10
+	xvnmsubadp	vs61,	vs41,	vs10
+	xvnmsubadp	vs62,	vs40,	vs11
+	xvnmsubadp	vs63,	vs41,	vs11
+
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+	lxvdsx		vs10,	o16,	T1
+
+	addi		T1,	T1,	24
+
+//############### OFFSET 6 #######################
+	xvmuldp		vs42,	vs42,	vs0
+	xvmuldp		vs43,	vs43,	vs0
+
+	addi		T1,	T1,	6*SIZE
+
+	xvnmsubadp	vs44,	vs42,	vs1
+	xvnmsubadp	vs45,	vs43,	vs1
+	xvnmsubadp	vs46,	vs42,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs47,	vs43,	vs2
+	xvnmsubadp	vs48,	vs42,	vs3
+	xvnmsubadp	vs49,	vs43,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs50,	vs42,	vs4
+	xvnmsubadp	vs51,	vs43,	vs4
+	xvnmsubadp	vs52,	vs42,	vs5
+	xvnmsubadp	vs53,	vs43,	vs5
+	xvnmsubadp	vs54,	vs42,	vs6
+	xvnmsubadp	vs55,	vs43,	vs6
+	xvnmsubadp	vs56,	vs42,	vs7
+	xvnmsubadp	vs57,	vs43,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs58,	vs42,	vs8
+	xvnmsubadp	vs59,	vs43,	vs8
+	xvnmsubadp	vs60,	vs42,	vs9
+	xvnmsubadp	vs61,	vs43,	vs9
+	xvnmsubadp	vs62,	vs42,	vs10
+	xvnmsubadp	vs63,	vs43,	vs10
+
+	lxvdsx		vs8,	o0,	T1
+	lxvdsx		vs9,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	stxvd2x		vs36,	o0,	T4
+	stxvd2x		vs37,	o16,	T4
+	stxvd2x		vs38,	o32,	T4
+	stxvd2x		vs39,	o48,	T4
+
+	addi		T4,	T4,	64
+
+//############### OFFSET 7 #######################
+	xvmuldp		vs44,	vs44,	vs0
+	xvmuldp		vs45,	vs45,	vs0
+
+	addi		T1,	T1,	7*SIZE
+
+	xvnmsubadp	vs46,	vs44,	vs1
+	xvnmsubadp	vs47,	vs45,	vs1
+	xvnmsubadp	vs48,	vs44,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs49,	vs45,	vs2
+	xvnmsubadp	vs50,	vs44,	vs3
+	xvnmsubadp	vs51,	vs45,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs52,	vs44,	vs4
+	xvnmsubadp	vs53,	vs45,	vs4
+	xvnmsubadp	vs54,	vs44,	vs5
+	xvnmsubadp	vs55,	vs45,	vs5
+	xvnmsubadp	vs56,	vs44,	vs6
+	xvnmsubadp	vs57,	vs45,	vs6
+	xvnmsubadp	vs58,	vs44,	vs7
+	xvnmsubadp	vs59,	vs45,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs60,	vs44,	vs8
+	xvnmsubadp	vs61,	vs45,	vs8
+	xvnmsubadp	vs62,	vs44,	vs9
+	xvnmsubadp	vs63,	vs45,	vs9
+
+	lxvdsx		vs8,	o0,	T1
+
+	addi		T1,	T1,	8
+
+//############### OFFSET 8 #######################
+	xvmuldp		vs46,	vs46,	vs0
+	xvmuldp		vs47,	vs47,	vs0
+
+	addi		T1,	T1,	8*SIZE
+
+	xvnmsubadp	vs48,	vs46,	vs1
+	xvnmsubadp	vs49,	vs47,	vs1
+	xvnmsubadp	vs50,	vs46,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs51,	vs47,	vs2
+	xvnmsubadp	vs52,	vs46,	vs3
+	xvnmsubadp	vs53,	vs47,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs54,	vs46,	vs4
+	xvnmsubadp	vs55,	vs47,	vs4
+	xvnmsubadp	vs56,	vs46,	vs5
+	xvnmsubadp	vs57,	vs47,	vs5
+	xvnmsubadp	vs58,	vs46,	vs6
+	xvnmsubadp	vs59,	vs47,	vs6
+	xvnmsubadp	vs60,	vs46,	vs7
+	xvnmsubadp	vs61,	vs47,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	stxvd2x		vs40,	o0,	T4
+	stxvd2x		vs41,	o16,	T4
+	stxvd2x		vs42,	o32,	T4
+	stxvd2x		vs43,	o48,	T4
+
+	addi		T4,	T4,	64
+
+	xvnmsubadp	vs62,	vs46,	vs8
+	xvnmsubadp	vs63,	vs47,	vs8
+
+
+//############### OFFSET 9 #######################
+	xvmuldp		vs48,	vs48,	vs0
+	xvmuldp		vs49,	vs49,	vs0
+
+	addi		T1,	T1,	9*SIZE
+
+	xvnmsubadp	vs50,	vs48,	vs1
+	xvnmsubadp	vs51,	vs49,	vs1
+	xvnmsubadp	vs52,	vs48,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs53,	vs49,	vs2
+	xvnmsubadp	vs54,	vs48,	vs3
+	xvnmsubadp	vs55,	vs49,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs56,	vs48,	vs4
+	xvnmsubadp	vs57,	vs49,	vs4
+	xvnmsubadp	vs58,	vs48,	vs5
+	xvnmsubadp	vs59,	vs49,	vs5
+	xvnmsubadp	vs60,	vs48,	vs6
+	xvnmsubadp	vs61,	vs49,	vs6
+	xvnmsubadp	vs62,	vs48,	vs7
+	xvnmsubadp	vs63,	vs49,	vs7
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+
+	addi		T1,	T1,	24
+
+//############### OFFSET 10 #######################
+	xvmuldp		vs50,	vs50,	vs0
+	xvmuldp		vs51,	vs51,	vs0
+
+	addi		T1,	T1,	10*SIZE
+
+	xvnmsubadp	vs52,	vs50,	vs1
+	xvnmsubadp	vs53,	vs51,	vs1
+	xvnmsubadp	vs54,	vs50,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs55,	vs51,	vs2
+	xvnmsubadp	vs56,	vs50,	vs3
+	xvnmsubadp	vs57,	vs51,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs58,	vs50,	vs4
+	xvnmsubadp	vs59,	vs51,	vs4
+	xvnmsubadp	vs60,	vs50,	vs5
+	xvnmsubadp	vs61,	vs51,	vs5
+	xvnmsubadp	vs62,	vs50,	vs6
+	xvnmsubadp	vs63,	vs51,	vs6
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	stxvd2x		vs44,	o0,	T4
+	stxvd2x		vs45,	o16,	T4
+	stxvd2x		vs46,	o32,	T4
+	stxvd2x		vs47,	o48,	T4
+
+	addi		T4,	T4,	64
+
+//############### OFFSET 11 #######################
+	xvmuldp		vs52,	vs52,	vs0
+	xvmuldp		vs53,	vs53,	vs0
+
+	addi		T1,	T1,	11*SIZE
+
+	xvnmsubadp	vs54,	vs52,	vs1
+	xvnmsubadp	vs55,	vs53,	vs1
+	xvnmsubadp	vs56,	vs52,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs57,	vs53,	vs2
+	xvnmsubadp	vs58,	vs52,	vs3
+	xvnmsubadp	vs59,	vs53,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvnmsubadp	vs60,	vs52,	vs4
+	xvnmsubadp	vs61,	vs53,	vs4
+	xvnmsubadp	vs62,	vs52,	vs5
+	xvnmsubadp	vs63,	vs53,	vs5
+
+	lxvdsx		vs4,	o0,	T1
+
+	addi		T1,	T1,	8
+
+//############### OFFSET 12 #######################
+	xvmuldp		vs54,	vs54,	vs0
+	xvmuldp		vs55,	vs55,	vs0
+
+	addi		T1,	T1,	12*SIZE
+
+	xvnmsubadp	vs56,	vs54,	vs1
+	xvnmsubadp	vs57,	vs55,	vs1
+	xvnmsubadp	vs58,	vs54,	vs2
+	dcbt		T1,	PRE
+	xvnmsubadp	vs59,	vs55,	vs2
+	xvnmsubadp	vs60,	vs54,	vs3
+	xvnmsubadp	vs61,	vs55,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	stxvd2x		vs48,	o0,	T4
+	stxvd2x		vs49,	o16,	T4
+	stxvd2x		vs50,	o32,	T4
+	stxvd2x		vs51,	o48,	T4
+
+	addi		T4,	T4,	64
+
+	xvnmsubadp	vs62,	vs54,	vs4
+	xvnmsubadp	vs63,	vs55,	vs4
+
+
+//############### OFFSET 13 #######################
+	xvmuldp		vs56,	vs56,	vs0
+	xvmuldp		vs57,	vs57,	vs0
+
+	addi		T1,	T1,	13*SIZE
+
+	xvnmsubadp	vs58,	vs56,	vs1
+	xvnmsubadp	vs59,	vs57,	vs1
+	xvnmsubadp	vs60,	vs56,	vs2
+	xvnmsubadp	vs61,	vs57,	vs2
+	xvnmsubadp	vs62,	vs56,	vs3
+	xvnmsubadp	vs63,	vs57,	vs3
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+
+	addi		T1,	T1,	24
+
+//############### OFFSET 14 #######################
+	xvmuldp		vs58,	vs58,	vs0
+	xvmuldp		vs59,	vs59,	vs0
+
+	addi		T1,	T1,	14*SIZE
+
+	xvnmsubadp	vs60,	vs58,	vs1
+	xvnmsubadp	vs61,	vs59,	vs1
+	xvnmsubadp	vs62,	vs58,	vs2
+	xvnmsubadp	vs63,	vs59,	vs2
+
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	stxvd2x		vs52,	o0,	T4
+	stxvd2x		vs53,	o16,	T4
+	stxvd2x		vs54,	o32,	T4
+	stxvd2x		vs55,	o48,	T4
+
+	addi		T4,	T4,	64
+//############### OFFSET 15 #######################
+	xvmuldp		vs60,	vs60,	vs0
+	xvmuldp		vs61,	vs61,	vs0
+
+	addi		T1,	T1,	15*SIZE
+
+	xvnmsubadp	vs62,	vs60,	vs1
+	xvnmsubadp	vs63,	vs61,	vs1
+
+	lxvdsx		vs0,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs62,	vs62,	vs0
+	xvmuldp		vs63,	vs63,	vs0
+
+
+//############### SAVE B #######################
+
+// Write the last result rows back to B (earlier rows were already stored
+// through T4 as they became final).
+
+	stxvd2x		vs56,	o0,	T4
+	stxvd2x		vs57,	o16,	T4
+	stxvd2x		vs58,	o32,	T4
+	stxvd2x		vs59,	o48,	T4
+
+	addi		T4,	T4,	64
+
+	stxvd2x		vs60,	o0,	T4
+	stxvd2x		vs61,	o16,	T4
+	stxvd2x		vs62,	o32,	T4
+	stxvd2x		vs63,	o48,	T4
+
+//############### SAVE C #######################
+
+// Scatter results to C: stxsdx stores one doubleword; xxswapd (helper
+// macro defined elsewhere in the file — presumably swaps the two
+// doubleword lanes) exposes the second lane for the next C row. Even
+// registers hold columns 0/1, odd registers columns 2/3.
+
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	stxsdx		vs32,	o0,	T1
+	xxswapd		vs32,	vs32
+	stxsdx		vs34,	o8,	T1
+	xxswapd		vs34,	vs34
+	stxsdx		vs36,	o16,	T1
+	xxswapd		vs36,	vs36
+	stxsdx		vs38,	o24,	T1
+	xxswapd		vs38,	vs38
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs40,	o0,	T1
+	xxswapd		vs40,	vs40
+	stxsdx		vs42,	o8,	T1
+	xxswapd		vs42,	vs42
+	stxsdx		vs44,	o16,	T1
+	xxswapd		vs44,	vs44
+	stxsdx		vs46,	o24,	T1
+	xxswapd		vs46,	vs46
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs48,	o0,	T1
+	xxswapd		vs48,	vs48
+	stxsdx		vs50,	o8,	T1
+	xxswapd		vs50,	vs50
+	stxsdx		vs52,	o16,	T1
+	xxswapd		vs52,	vs52
+	stxsdx		vs54,	o24,	T1
+	xxswapd		vs54,	vs54
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs56,	o0,	T1
+	xxswapd		vs56,	vs56
+	stxsdx		vs58,	o8,	T1
+	xxswapd		vs58,	vs58
+	stxsdx		vs60,	o16,	T1
+	xxswapd		vs60,	vs60
+	stxsdx		vs62,	o24,	T1
+	xxswapd		vs62,	vs62
+
+	stxsdx		vs32,	o0,	T2
+	stxsdx		vs34,	o8,	T2
+	stxsdx		vs36,	o16,	T2
+	stxsdx		vs38,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs40,	o0,	T2
+	stxsdx		vs42,	o8,	T2
+	stxsdx		vs44,	o16,	T2
+	stxsdx		vs46,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs48,	o0,	T2
+	stxsdx		vs50,	o8,	T2
+	stxsdx		vs52,	o16,	T2
+	stxsdx		vs54,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs56,	o0,	T2
+	stxsdx		vs58,	o8,	T2
+	stxsdx		vs60,	o16,	T2
+	stxsdx		vs62,	o24,	T2
+
+// NOTE(review): the mr T1,CO / add T2 pair below is immediately
+// overwritten by the following two adds — likely leftover; T1/T2 end up
+// pointing at C rows 2 and 3 (CO+2*LDC, CO+3*LDC).
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	add		T1,	T2,	LDC
+	add		T2,	T1,	LDC
+
+
+	stxsdx		vs33,	o0,	T1
+	xxswapd		vs33,	vs33
+	stxsdx		vs35,	o8,	T1
+	xxswapd		vs35,	vs35
+	stxsdx		vs37,	o16,	T1
+	xxswapd		vs37,	vs37
+	stxsdx		vs39,	o24,	T1
+	xxswapd		vs39,	vs39
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs41,	o0,	T1
+	xxswapd		vs41,	vs41
+	stxsdx		vs43,	o8,	T1
+	xxswapd		vs43,	vs43
+	stxsdx		vs45,	o16,	T1
+	xxswapd		vs45,	vs45
+	stxsdx		vs47,	o24,	T1
+	xxswapd		vs47,	vs47
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs49,	o0,	T1
+	xxswapd		vs49,	vs49
+	stxsdx		vs51,	o8,	T1
+	xxswapd		vs51,	vs51
+	stxsdx		vs53,	o16,	T1
+	xxswapd		vs53,	vs53
+	stxsdx		vs55,	o24,	T1
+	xxswapd		vs55,	vs55
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs57,	o0,	T1
+	xxswapd		vs57,	vs57
+	stxsdx		vs59,	o8,	T1
+	xxswapd		vs59,	vs59
+	stxsdx		vs61,	o16,	T1
+	xxswapd		vs61,	vs61
+	stxsdx		vs63,	o24,	T1
+	xxswapd		vs63,	vs63
+
+	stxsdx		vs33,	o0,	T2
+	stxsdx		vs35,	o8,	T2
+	stxsdx		vs37,	o16,	T2
+	stxsdx		vs39,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs41,	o0,	T2
+	stxsdx		vs43,	o8,	T2
+	stxsdx		vs45,	o16,	T2
+	stxsdx		vs47,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs49,	o0,	T2
+	stxsdx		vs51,	o8,	T2
+	stxsdx		vs53,	o16,	T2
+	stxsdx		vs55,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs57,	o0,	T2
+	stxsdx		vs59,	o8,	T2
+	stxsdx		vs61,	o16,	T2
+	stxsdx		vs63,	o24,	T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 8x4
+##########################################################################################*/
+
+.macro SOLVE_LT_8x4
+
+// Forward substitution for an 8x4 tile (non-pipelined variant of
+// SOLVE_LT_16x4): repack accumulators, form B - A*X, then solve row by
+// row against the packed triangular factor in AO (row k: k skipped
+// entries, inverse diagonal, 7-k multipliers; 8*SIZE stride per row).
+
+	xxpermdi	vs0,	vs32,	vs33,	0
+	xxpermdi	vs1,	vs34,	vs35,	0
+	xxpermdi	vs2,	vs32,	vs33,	3
+	xxpermdi	vs3,	vs34,	vs35,	3
+
+	xxpermdi	vs4,	vs36,	vs37,	0
+	xxpermdi	vs5,	vs38,	vs39,	0
+	xxpermdi	vs6,	vs36,	vs37,	3
+	xxpermdi	vs7,	vs38,	vs39,	3
+
+	xxpermdi	vs8,	vs40,	vs41,	0
+	xxpermdi	vs9,	vs42,	vs43,	0
+	xxpermdi	vs10,	vs40,	vs41,	3
+	xxpermdi	vs11,	vs42,	vs43,	3
+
+	xxpermdi	vs12,	vs44,	vs45,	0
+	xxpermdi	vs13,	vs46,	vs47,	0
+	xxpermdi	vs14,	vs44,	vs45,	3
+	xxpermdi	vs15,	vs46,	vs47,	3
+
+
+//############### LOAD B #######################
+
+
+	mr		T1,	BO
+
+	lxvd2x		vs32,	o0,	T1
+	lxvd2x		vs33,	o16,	T1
+	lxvd2x		vs34,	o32,	T1
+	lxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	lxvd2x		vs36,	o0,	T1
+	lxvd2x		vs37,	o16,	T1
+	lxvd2x		vs38,	o32,	T1
+	lxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	lxvd2x		vs40,	o0,	T1
+	lxvd2x		vs41,	o16,	T1
+	lxvd2x		vs42,	o32,	T1
+	lxvd2x		vs43,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	lxvd2x		vs44,	o0,	T1
+	lxvd2x		vs45,	o16,	T1
+	lxvd2x		vs46,	o32,	T1
+	lxvd2x		vs47,	o48,	T1
+
+// B - A*X
+	xvsubdp		vs32,	vs32,	vs0
+	xvsubdp		vs33,	vs33,	vs1
+	xvsubdp		vs34,	vs34,	vs2
+	xvsubdp		vs35,	vs35,	vs3
+	xvsubdp		vs36,	vs36,	vs4
+	xvsubdp		vs37,	vs37,	vs5
+	xvsubdp		vs38,	vs38,	vs6
+	xvsubdp		vs39,	vs39,	vs7
+	xvsubdp		vs40,	vs40,	vs8
+	xvsubdp		vs41,	vs41,	vs9
+	xvsubdp		vs42,	vs42,	vs10
+	xvsubdp		vs43,	vs43,	vs11
+	xvsubdp		vs44,	vs44,	vs12
+	xvsubdp		vs45,	vs45,	vs13
+	xvsubdp		vs46,	vs46,	vs14
+	xvsubdp		vs47,	vs47,	vs15
+
+	mr		T1, AO
+
+
+//############### OFFSET 0 #######################
+
+// Row 0: scale by the inverse diagonal (vs0), then eliminate from rows
+// 1..7 with multipliers vs1-vs7. Subsequent OFFSET blocks do the same
+// with one fewer multiplier each.
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+	lxvdsx		vs7,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvmuldp		vs32,	vs32,	vs0
+	xvmuldp		vs33,	vs33,	vs0
+
+	xvnmsubadp	vs34,	vs32,	vs1
+	xvnmsubadp	vs35,	vs33,	vs1
+	xvnmsubadp	vs36,	vs32,	vs2
+	xvnmsubadp	vs37,	vs33,	vs2
+	xvnmsubadp	vs38,	vs32,	vs3
+	xvnmsubadp	vs39,	vs33,	vs3
+	xvnmsubadp	vs40,	vs32,	vs4
+	xvnmsubadp	vs41,	vs33,	vs4
+	xvnmsubadp	vs42,	vs32,	vs5
+	xvnmsubadp	vs43,	vs33,	vs5
+	xvnmsubadp	vs44,	vs32,	vs6
+	xvnmsubadp	vs45,	vs33,	vs6
+	xvnmsubadp	vs46,	vs32,	vs7
+	xvnmsubadp	vs47,	vs33,	vs7
+
+//############### OFFSET 1 #######################
+
+	addi		T1,	T1,	1*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+	lxvdsx		vs6,	o16,	T1
+
+	addi		T1,	T1,	24
+
+	xvmuldp		vs34,	vs34,	vs0
+	xvmuldp		vs35,	vs35,	vs0
+
+	xvnmsubadp	vs36,	vs34,	vs1
+	xvnmsubadp	vs37,	vs35,	vs1
+	xvnmsubadp	vs38,	vs34,	vs2
+	xvnmsubadp	vs39,	vs35,	vs2
+	xvnmsubadp	vs40,	vs34,	vs3
+	xvnmsubadp	vs41,	vs35,	vs3
+	xvnmsubadp	vs42,	vs34,	vs4
+	xvnmsubadp	vs43,	vs35,	vs4
+	xvnmsubadp	vs44,	vs34,	vs5
+	xvnmsubadp	vs45,	vs35,	vs5
+	xvnmsubadp	vs46,	vs34,	vs6
+	xvnmsubadp	vs47,	vs35,	vs6
+
+//############### OFFSET 2 #######################
+
+	addi		T1,	T1,	2*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	lxvdsx		vs4,	o0,	T1
+	lxvdsx		vs5,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	xvmuldp		vs36,	vs36,	vs0
+	xvmuldp		vs37,	vs37,	vs0
+
+	xvnmsubadp	vs38,	vs36,	vs1
+	xvnmsubadp	vs39,	vs37,	vs1
+	xvnmsubadp	vs40,	vs36,	vs2
+	xvnmsubadp	vs41,	vs37,	vs2
+	xvnmsubadp	vs42,	vs36,	vs3
+	xvnmsubadp	vs43,	vs37,	vs3
+	xvnmsubadp	vs44,	vs36,	vs4
+	xvnmsubadp	vs45,	vs37,	vs4
+	xvnmsubadp	vs46,	vs36,	vs5
+	xvnmsubadp	vs47,	vs37,	vs5
+
+//############### OFFSET 3 #######################
+
+	addi		T1,	T1,	3*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	lxvdsx		vs4,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs38,	vs38,	vs0
+	xvmuldp		vs39,	vs39,	vs0
+
+	xvnmsubadp	vs40,	vs38,	vs1
+	xvnmsubadp	vs41,	vs39,	vs1
+	xvnmsubadp	vs42,	vs38,	vs2
+	xvnmsubadp	vs43,	vs39,	vs2
+	xvnmsubadp	vs44,	vs38,	vs3
+	xvnmsubadp	vs45,	vs39,	vs3
+	xvnmsubadp	vs46,	vs38,	vs4
+	xvnmsubadp	vs47,	vs39,	vs4
+
+//############### OFFSET 4 #######################
+
+	addi		T1,	T1,	4*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvmuldp		vs40,	vs40,	vs0
+	xvmuldp		vs41,	vs41,	vs0
+
+	xvnmsubadp	vs42,	vs40,	vs1
+	xvnmsubadp	vs43,	vs41,	vs1
+	xvnmsubadp	vs44,	vs40,	vs2
+	xvnmsubadp	vs45,	vs41,	vs2
+	xvnmsubadp	vs46,	vs40,	vs3
+	xvnmsubadp	vs47,	vs41,	vs3
+
+//############### OFFSET 5 #######################
+
+	addi		T1,	T1,	5*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+
+	addi		T1,	T1,	24
+
+	xvmuldp		vs42,	vs42,	vs0
+	xvmuldp		vs43,	vs43,	vs0
+
+	xvnmsubadp	vs44,	vs42,	vs1
+	xvnmsubadp	vs45,	vs43,	vs1
+	xvnmsubadp	vs46,	vs42,	vs2
+	xvnmsubadp	vs47,	vs43,	vs2
+
+//############### OFFSET 6 #######################
+
+	addi		T1,	T1,	6*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	xvmuldp		vs44,	vs44,	vs0
+	xvmuldp		vs45,	vs45,	vs0
+
+	xvnmsubadp	vs46,	vs44,	vs1
+	xvnmsubadp	vs47,	vs45,	vs1
+
+//############### OFFSET 7 #######################
+
+	addi		T1,	T1,	7*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs46,	vs46,	vs0
+	xvmuldp		vs47,	vs47,	vs0
+
+
+//############### SAVE B #######################
+
+
+	mr		T1,	BO
+
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+
+//############### SAVE C #######################
+
+// Store one doubleword per element; xxswapd (helper macro, presumably a
+// lane swap) exposes the second lane for the CO+LDC row.
+
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	stxsdx		vs32,	o0,	T1
+	xxswapd		vs32,	vs32
+	stxsdx		vs34,	o8,	T1
+	xxswapd		vs34,	vs34
+	stxsdx		vs36,	o16,	T1
+	xxswapd		vs36,	vs36
+	stxsdx		vs38,	o24,	T1
+	xxswapd		vs38,	vs38
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs40,	o0,	T1
+	xxswapd		vs40,	vs40
+	stxsdx		vs42,	o8,	T1
+	xxswapd		vs42,	vs42
+	stxsdx		vs44,	o16,	T1
+	xxswapd		vs44,	vs44
+	stxsdx		vs46,	o24,	T1
+	xxswapd		vs46,	vs46
+
+	stxsdx		vs32,	o0,	T2
+	stxsdx		vs34,	o8,	T2
+	stxsdx		vs36,	o16,	T2
+	stxsdx		vs38,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs40,	o0,	T2
+	stxsdx		vs42,	o8,	T2
+	stxsdx		vs44,	o16,	T2
+	stxsdx		vs46,	o24,	T2
+
+// NOTE(review): mr T1,CO / add T2 below are immediately overwritten;
+// net effect is T1 = CO+2*LDC, T2 = CO+3*LDC for columns 2/3.
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	add		T1,	T2,	LDC
+	add		T2,	T1,	LDC
+
+
+	stxsdx		vs33,	o0,	T1
+	xxswapd		vs33,	vs33
+	stxsdx		vs35,	o8,	T1
+	xxswapd		vs35,	vs35
+	stxsdx		vs37,	o16,	T1
+	xxswapd		vs37,	vs37
+	stxsdx		vs39,	o24,	T1
+	xxswapd		vs39,	vs39
+
+	addi		T1,	T1,	32
+
+	stxsdx		vs41,	o0,	T1
+	xxswapd		vs41,	vs41
+	stxsdx		vs43,	o8,	T1
+	xxswapd		vs43,	vs43
+	stxsdx		vs45,	o16,	T1
+	xxswapd		vs45,	vs45
+	stxsdx		vs47,	o24,	T1
+	xxswapd		vs47,	vs47
+
+	stxsdx		vs33,	o0,	T2
+	stxsdx		vs35,	o8,	T2
+	stxsdx		vs37,	o16,	T2
+	stxsdx		vs39,	o24,	T2
+
+	addi		T2,	T2,	32
+
+	stxsdx		vs41,	o0,	T2
+	stxsdx		vs43,	o8,	T2
+	stxsdx		vs45,	o16,	T2
+	stxsdx		vs47,	o24,	T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 4x4
+##########################################################################################*/
+
+.macro SOLVE_LT_4x4
+
+// Forward substitution for a 4x4 tile: repack accumulators, form B - A*X,
+// then solve the four rows against the packed triangular factor in AO
+// (row k: k skipped entries, inverse diagonal, 3-k multipliers).
+
+	xxpermdi	vs0,	vs32,	vs33,	0
+	xxpermdi	vs1,	vs34,	vs35,	0
+	xxpermdi	vs2,	vs32,	vs33,	3
+	xxpermdi	vs3,	vs34,	vs35,	3
+
+	xxpermdi	vs4,	vs36,	vs37,	0
+	xxpermdi	vs5,	vs38,	vs39,	0
+	xxpermdi	vs6,	vs36,	vs37,	3
+	xxpermdi	vs7,	vs38,	vs39,	3
+
+
+//############### LOAD B #######################
+
+
+	mr		T1,	BO
+
+	lxvd2x		vs32,	o0,	T1
+	lxvd2x		vs33,	o16,	T1
+	lxvd2x		vs34,	o32,	T1
+	lxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	lxvd2x		vs36,	o0,	T1
+	lxvd2x		vs37,	o16,	T1
+	lxvd2x		vs38,	o32,	T1
+	lxvd2x		vs39,	o48,	T1
+
+// B - A*X
+	xvsubdp		vs32,	vs32,	vs0
+	xvsubdp		vs33,	vs33,	vs1
+	xvsubdp		vs34,	vs34,	vs2
+	xvsubdp		vs35,	vs35,	vs3
+	xvsubdp		vs36,	vs36,	vs4
+	xvsubdp		vs37,	vs37,	vs5
+	xvsubdp		vs38,	vs38,	vs6
+	xvsubdp		vs39,	vs39,	vs7
+
+	mr		T1, AO
+
+
+//############### OFFSET 0 #######################
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+	lxvdsx		vs3,	o24,	T1
+
+	addi		T1,	T1,	32
+
+	xvmuldp		vs32,	vs32,	vs0
+	xvmuldp		vs33,	vs33,	vs0
+
+	xvnmsubadp	vs34,	vs32,	vs1
+	xvnmsubadp	vs35,	vs33,	vs1
+	xvnmsubadp	vs36,	vs32,	vs2
+	xvnmsubadp	vs37,	vs33,	vs2
+	xvnmsubadp	vs38,	vs32,	vs3
+	xvnmsubadp	vs39,	vs33,	vs3
+
+//############### OFFSET 1 #######################
+
+	addi		T1,	T1,	1*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+	lxvdsx		vs2,	o16,	T1
+
+	addi		T1,	T1,	24
+
+	xvmuldp		vs34,	vs34,	vs0
+	xvmuldp		vs35,	vs35,	vs0
+
+	xvnmsubadp	vs36,	vs34,	vs1
+	xvnmsubadp	vs37,	vs35,	vs1
+	xvnmsubadp	vs38,	vs34,	vs2
+	xvnmsubadp	vs39,	vs35,	vs2
+
+//############### OFFSET 2 #######################
+
+	addi		T1,	T1,	2*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	xvmuldp		vs36,	vs36,	vs0
+	xvmuldp		vs37,	vs37,	vs0
+
+	xvnmsubadp	vs38,	vs36,	vs1
+	xvnmsubadp	vs39,	vs37,	vs1
+
+//############### OFFSET 3 #######################
+
+	addi		T1,	T1,	3*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs38,	vs38,	vs0
+	xvmuldp		vs39,	vs39,	vs0
+
+
+//############### SAVE B #######################
+
+
+	mr		T1,	BO
+
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+
+//############### SAVE C #######################
+
+// xxswapd (helper macro, presumably a lane swap) exposes the second lane
+// for the CO+LDC row.
+
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	stxsdx		vs32,	o0,	T1
+	xxswapd		vs32,	vs32
+	stxsdx		vs34,	o8,	T1
+	xxswapd		vs34,	vs34
+	stxsdx		vs36,	o16,	T1
+	xxswapd		vs36,	vs36
+	stxsdx		vs38,	o24,	T1
+	xxswapd		vs38,	vs38
+
+	stxsdx		vs32,	o0,	T2
+	stxsdx		vs34,	o8,	T2
+	stxsdx		vs36,	o16,	T2
+	stxsdx		vs38,	o24,	T2
+
+// NOTE(review): mr T1,CO / add T2 below are immediately overwritten;
+// net effect is T1 = CO+2*LDC, T2 = CO+3*LDC for columns 2/3.
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	add		T1,	T2,	LDC
+	add		T2,	T1,	LDC
+
+
+	stxsdx		vs33,	o0,	T1
+	xxswapd		vs33,	vs33
+	stxsdx		vs35,	o8,	T1
+	xxswapd		vs35,	vs35
+	stxsdx		vs37,	o16,	T1
+	xxswapd		vs37,	vs37
+	stxsdx		vs39,	o24,	T1
+	xxswapd		vs39,	vs39
+
+	stxsdx		vs33,	o0,	T2
+	stxsdx		vs35,	o8,	T2
+	stxsdx		vs37,	o16,	T2
+	stxsdx		vs39,	o24,	T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 2x4
+##########################################################################################*/
+
+.macro SOLVE_LT_2x4
+
+// Forward substitution for a 2x4 tile: two triangular rows against four
+// right-hand-side columns (two columns per vector register).
+
+	xxpermdi	vs0,	vs32,	vs33,	0
+	xxpermdi	vs1,	vs34,	vs35,	0
+	xxpermdi	vs2,	vs32,	vs33,	3
+	xxpermdi	vs3,	vs34,	vs35,	3
+
+
+//############### LOAD B #######################
+
+
+	mr		T1,	BO
+
+	lxvd2x		vs32,	o0,	T1
+	lxvd2x		vs33,	o16,	T1
+	lxvd2x		vs34,	o32,	T1
+	lxvd2x		vs35,	o48,	T1
+
+// B - A*X
+	xvsubdp		vs32,	vs32,	vs0
+	xvsubdp		vs33,	vs33,	vs1
+	xvsubdp		vs34,	vs34,	vs2
+	xvsubdp		vs35,	vs35,	vs3
+
+	mr		T1, AO
+
+
+//############### OFFSET 0 #######################
+
+// Row 0: scale by the inverse diagonal, eliminate from row 1.
+
+	lxvdsx		vs0,	o0,	T1
+	lxvdsx		vs1,	o8,	T1
+
+	addi		T1,	T1,	16
+
+	xvmuldp		vs32,	vs32,	vs0
+	xvmuldp		vs33,	vs33,	vs0
+
+	xvnmsubadp	vs34,	vs32,	vs1
+	xvnmsubadp	vs35,	vs33,	vs1
+
+//############### OFFSET 1 #######################
+
+	addi		T1,	T1,	1*SIZE
+
+	lxvdsx		vs0,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs34,	vs34,	vs0
+	xvmuldp		vs35,	vs35,	vs0
+
+
+//############### SAVE B #######################
+
+
+	mr		T1,	BO
+
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+
+//############### SAVE C #######################
+
+// xxswapd (helper macro, presumably a lane swap) exposes the second lane
+// for the CO+LDC row.
+
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	stxsdx		vs32,	o0,	T1
+	xxswapd		vs32,	vs32
+	stxsdx		vs34,	o8,	T1
+	xxswapd		vs34,	vs34
+
+	stxsdx		vs32,	o0,	T2
+	stxsdx		vs34,	o8,	T2
+
+// NOTE(review): mr T1,CO / add T2 below are immediately overwritten;
+// net effect is T1 = CO+2*LDC, T2 = CO+3*LDC for columns 2/3.
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	add		T1,	T2,	LDC
+	add		T2,	T1,	LDC
+
+
+	stxsdx		vs33,	o0,	T1
+	xxswapd		vs33,	vs33
+	stxsdx		vs35,	o8,	T1
+	xxswapd		vs35,	vs35
+
+	stxsdx		vs33,	o0,	T2
+	stxsdx		vs35,	o8,	T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 1x4
+##########################################################################################*/
+
+.macro SOLVE_LT_1x4
+
+// Forward substitution for a 1x4 tile: a single triangular row, so the
+// solve degenerates to scaling the four right-hand-side values by the
+// inverse diagonal (one splatted value from AO).
+
+	xxpermdi	vs0,	vs32,	vs33,	0
+	xxpermdi	vs1,	vs34,	vs35,	0
+
+//############### LOAD B #######################
+
+
+	mr		T1,	BO
+
+	lxvd2x		vs32,	o0,	T1
+	lxvd2x		vs33,	o16,	T1
+
+// B - A*X
+	xvsubdp		vs32,	vs32,	vs0
+	xvsubdp		vs33,	vs33,	vs1
+
+	mr		T1, AO
+
+
+//############### OFFSET 0 #######################
+
+	lxvdsx		vs0,	o0,	T1
+
+	addi		T1,	T1,	8
+
+	xvmuldp		vs32,	vs32,	vs0
+	xvmuldp		vs33,	vs33,	vs0
+
+
+//############### SAVE B #######################
+
+
+	mr		T1,	BO
+
+
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+//############### SAVE C #######################
+
+// xxswapd (helper macro, presumably a lane swap) exposes the second lane
+// for the CO+LDC row.
+
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	stxsdx		vs32,	o0,	T1
+	xxswapd		vs32,	vs32
+
+	stxsdx		vs32,	o0,	T2
+
+// NOTE(review): mr T1,CO / add T2 below are immediately overwritten;
+// net effect is T1 = CO+2*LDC, T2 = CO+3*LDC for columns 2/3.
+	mr		T1,	CO
+	add		T2,	CO,	LDC
+
+
+	add		T1,	T2,	LDC
+	add		T2,	T1,	LDC
+
+
+	stxsdx		vs33,	o0,	T1
+	xxswapd		vs33,	vs33
+
+	stxsdx		vs33,	o0,	T2
+
+.endm
+
+
+.macro INIT_16x2
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+ xvmovdp vs40, vs0
+ xvmovdp vs41, vs0
+ xvmovdp vs42, vs0
+ xvmovdp vs43, vs0
+ xvmovdp vs44, vs0
+ xvmovdp vs45, vs0
+ xvmovdp vs46, vs0
+ xvmovdp vs47, vs0
+
+.endm
+
+
+.macro KERNEL_16x2
+
+
+ lxvd2x vs0, o0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+ xvmaddadp vs40, vs4, vs16
+ xvmaddadp vs41, vs4, vs17
+ xvmaddadp vs42, vs5, vs16
+ xvmaddadp vs43, vs5, vs17
+ xvmaddadp vs44, vs6, vs16
+ xvmaddadp vs45, vs6, vs17
+ xvmaddadp vs46, vs7, vs16
+ xvmaddadp vs47, vs7, vs17
+
+
+.endm
+
+
+.macro INIT_8x2
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+
+.endm
+
+
+.macro KERNEL_8x2
+
+
+ lxvd2x vs0, o0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+ xvmaddadp vs36, vs2, vs16
+ xvmaddadp vs37, vs2, vs17
+ xvmaddadp vs38, vs3, vs16
+ xvmaddadp vs39, vs3, vs17
+
+
+.endm
+
+
+.macro INIT_4x2
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+
+.endm
+
+
+.macro KERNEL_4x2
+
+
+ lxvd2x vs0, o0, AO
+ lxvd2x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+ xvmaddadp vs34, vs1, vs16
+ xvmaddadp vs35, vs1, vs17
+
+
+.endm
+
+
+.macro INIT_2x2
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+
+.endm
+
+
+.macro KERNEL_2x2
+
+
+ lxvd2x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+
+.endm
+
+
+.macro INIT_1x2
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+
+.endm
+
+
+.macro KERNEL_1x2
+
+
+ lxvdsx vs0, o0, AO
+
+ addi AO, AO, 8
+
+ lxvdsx vs16, o0, BO
+ lxvdsx vs17, o8, BO
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs0, vs17
+
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 16x2
+##########################################################################################*/
+
+.macro SOLVE_LT_16x2
+
+ xxpermdi vs0, vs32, vs33, 0
+ xxpermdi vs1, vs32, vs33, 3
+
+ xxpermdi vs2, vs34, vs35, 0
+ xxpermdi vs3, vs34, vs35, 3
+
+ xxpermdi vs4, vs36, vs37, 0
+ xxpermdi vs5, vs36, vs37, 3
+
+ xxpermdi vs6, vs38, vs39, 0
+ xxpermdi vs7, vs38, vs39, 3
+
+ xxpermdi vs8, vs40, vs41, 0
+ xxpermdi vs9, vs40, vs41, 3
+
+ xxpermdi vs10, vs42, vs43, 0
+ xxpermdi vs11, vs42, vs43, 3
+
+ xxpermdi vs12, vs44, vs45, 0
+ xxpermdi vs13, vs44, vs45, 3
+
+ xxpermdi vs14, vs46, vs47, 0
+ xxpermdi vs15, vs46, vs47, 3
+
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxvd2x vs32, o0, T1
+ lxvd2x vs33, o16, T1
+ lxvd2x vs34, o32, T1
+ lxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ lxvd2x vs36, o0, T1
+ lxvd2x vs37, o16, T1
+ lxvd2x vs38, o32, T1
+ lxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ lxvd2x vs40, o0, T1
+ lxvd2x vs41, o16, T1
+ lxvd2x vs42, o32, T1
+ lxvd2x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ lxvd2x vs44, o0, T1
+ lxvd2x vs45, o16, T1
+ lxvd2x vs46, o32, T1
+ lxvd2x vs47, o48, T1
+
+ xvsubdp vs32, vs32, vs0
+ xvsubdp vs33, vs33, vs1
+ xvsubdp vs34, vs34, vs2
+ xvsubdp vs35, vs35, vs3
+ xvsubdp vs36, vs36, vs4
+ xvsubdp vs37, vs37, vs5
+ xvsubdp vs38, vs38, vs6
+ xvsubdp vs39, vs39, vs7
+ xvsubdp vs40, vs40, vs8
+ xvsubdp vs41, vs41, vs9
+ xvsubdp vs42, vs42, vs10
+ xvsubdp vs43, vs43, vs11
+ xvsubdp vs44, vs44, vs12
+ xvsubdp vs45, vs45, vs13
+ xvsubdp vs46, vs46, vs14
+ xvsubdp vs47, vs47, vs15
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+ lxvdsx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs12, o0, T1
+ lxvdsx vs13, o8, T1
+ lxvdsx vs14, o16, T1
+ lxvdsx vs15, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs32, vs32, vs0
+ xvnmsubadp vs33, vs32, vs1
+ xvnmsubadp vs34, vs32, vs2
+ xvnmsubadp vs35, vs32, vs3
+ xvnmsubadp vs36, vs32, vs4
+ xvnmsubadp vs37, vs32, vs5
+ xvnmsubadp vs38, vs32, vs6
+ xvnmsubadp vs39, vs32, vs7
+ xvnmsubadp vs40, vs32, vs8
+ xvnmsubadp vs41, vs32, vs9
+ xvnmsubadp vs42, vs32, vs10
+ xvnmsubadp vs43, vs32, vs11
+ xvnmsubadp vs44, vs32, vs12
+ xvnmsubadp vs45, vs32, vs13
+ xvnmsubadp vs46, vs32, vs14
+ xvnmsubadp vs47, vs32, vs15
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+ lxvdsx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs12, o0, T1
+ lxvdsx vs13, o8, T1
+ lxvdsx vs14, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs33, vs33, vs0
+ xvnmsubadp vs34, vs33, vs1
+ xvnmsubadp vs35, vs33, vs2
+ xvnmsubadp vs36, vs33, vs3
+ xvnmsubadp vs37, vs33, vs4
+ xvnmsubadp vs38, vs33, vs5
+ xvnmsubadp vs39, vs33, vs6
+ xvnmsubadp vs40, vs33, vs7
+ xvnmsubadp vs41, vs33, vs8
+ xvnmsubadp vs42, vs33, vs9
+ xvnmsubadp vs43, vs33, vs10
+ xvnmsubadp vs44, vs33, vs11
+ xvnmsubadp vs45, vs33, vs12
+ xvnmsubadp vs46, vs33, vs13
+ xvnmsubadp vs47, vs33, vs14
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+ lxvdsx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs12, o0, T1
+ lxvdsx vs13, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs34, vs34, vs0
+ xvnmsubadp vs35, vs34, vs1
+ xvnmsubadp vs36, vs34, vs2
+ xvnmsubadp vs37, vs34, vs3
+ xvnmsubadp vs38, vs34, vs4
+ xvnmsubadp vs39, vs34, vs5
+ xvnmsubadp vs40, vs34, vs6
+ xvnmsubadp vs41, vs34, vs7
+ xvnmsubadp vs42, vs34, vs8
+ xvnmsubadp vs43, vs34, vs9
+ xvnmsubadp vs44, vs34, vs10
+ xvnmsubadp vs45, vs34, vs11
+ xvnmsubadp vs46, vs34, vs12
+ xvnmsubadp vs47, vs34, vs13
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+ lxvdsx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs12, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs35, vs35, vs0
+ xvnmsubadp vs36, vs35, vs1
+ xvnmsubadp vs37, vs35, vs2
+ xvnmsubadp vs38, vs35, vs3
+ xvnmsubadp vs39, vs35, vs4
+ xvnmsubadp vs40, vs35, vs5
+ xvnmsubadp vs41, vs35, vs6
+ xvnmsubadp vs42, vs35, vs7
+ xvnmsubadp vs43, vs35, vs8
+ xvnmsubadp vs44, vs35, vs9
+ xvnmsubadp vs45, vs35, vs10
+ xvnmsubadp vs46, vs35, vs11
+ xvnmsubadp vs47, vs35, vs12
+
+//############### OFFSET 4 #######################
+
+ addi T1, T1, 4*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+ lxvdsx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs36, vs36, vs0
+ xvnmsubadp vs37, vs36, vs1
+ xvnmsubadp vs38, vs36, vs2
+ xvnmsubadp vs39, vs36, vs3
+ xvnmsubadp vs40, vs36, vs4
+ xvnmsubadp vs41, vs36, vs5
+ xvnmsubadp vs42, vs36, vs6
+ xvnmsubadp vs43, vs36, vs7
+ xvnmsubadp vs44, vs36, vs8
+ xvnmsubadp vs45, vs36, vs9
+ xvnmsubadp vs46, vs36, vs10
+ xvnmsubadp vs47, vs36, vs11
+
+//############### OFFSET 5 #######################
+
+ addi T1, T1, 5*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+ lxvdsx vs10, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs37, vs37, vs0
+ xvnmsubadp vs38, vs37, vs1
+ xvnmsubadp vs39, vs37, vs2
+ xvnmsubadp vs40, vs37, vs3
+ xvnmsubadp vs41, vs37, vs4
+ xvnmsubadp vs42, vs37, vs5
+ xvnmsubadp vs43, vs37, vs6
+ xvnmsubadp vs44, vs37, vs7
+ xvnmsubadp vs45, vs37, vs8
+ xvnmsubadp vs46, vs37, vs9
+ xvnmsubadp vs47, vs37, vs10
+
+//############### OFFSET 6 #######################
+
+ addi T1, T1, 6*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+ lxvdsx vs9, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs38, vs38, vs0
+ xvnmsubadp vs39, vs38, vs1
+ xvnmsubadp vs40, vs38, vs2
+ xvnmsubadp vs41, vs38, vs3
+ xvnmsubadp vs42, vs38, vs4
+ xvnmsubadp vs43, vs38, vs5
+ xvnmsubadp vs44, vs38, vs6
+ xvnmsubadp vs45, vs38, vs7
+ xvnmsubadp vs46, vs38, vs8
+ xvnmsubadp vs47, vs38, vs9
+
+//############### OFFSET 7 #######################
+
+ addi T1, T1, 7*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs8, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs39, vs39, vs0
+ xvnmsubadp vs40, vs39, vs1
+ xvnmsubadp vs41, vs39, vs2
+ xvnmsubadp vs42, vs39, vs3
+ xvnmsubadp vs43, vs39, vs4
+ xvnmsubadp vs44, vs39, vs5
+ xvnmsubadp vs45, vs39, vs6
+ xvnmsubadp vs46, vs39, vs7
+ xvnmsubadp vs47, vs39, vs8
+
+//############### OFFSET 8 #######################
+
+ addi T1, T1, 8*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs40, vs40, vs0
+ xvnmsubadp vs41, vs40, vs1
+ xvnmsubadp vs42, vs40, vs2
+ xvnmsubadp vs43, vs40, vs3
+ xvnmsubadp vs44, vs40, vs4
+ xvnmsubadp vs45, vs40, vs5
+ xvnmsubadp vs46, vs40, vs6
+ xvnmsubadp vs47, vs40, vs7
+
+//############### OFFSET 9 #######################
+
+ addi T1, T1, 9*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs41, vs41, vs0
+ xvnmsubadp vs42, vs41, vs1
+ xvnmsubadp vs43, vs41, vs2
+ xvnmsubadp vs44, vs41, vs3
+ xvnmsubadp vs45, vs41, vs4
+ xvnmsubadp vs46, vs41, vs5
+ xvnmsubadp vs47, vs41, vs6
+
+//############### OFFSET 10 #######################
+
+ addi T1, T1, 10*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs42, vs42, vs0
+ xvnmsubadp vs43, vs42, vs1
+ xvnmsubadp vs44, vs42, vs2
+ xvnmsubadp vs45, vs42, vs3
+ xvnmsubadp vs46, vs42, vs4
+ xvnmsubadp vs47, vs42, vs5
+
+//############### OFFSET 11 #######################
+
+ addi T1, T1, 11*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs43, vs43, vs0
+ xvnmsubadp vs44, vs43, vs1
+ xvnmsubadp vs45, vs43, vs2
+ xvnmsubadp vs46, vs43, vs3
+ xvnmsubadp vs47, vs43, vs4
+
+//############### OFFSET 12 #######################
+
+ addi T1, T1, 12*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs44, vs44, vs0
+ xvnmsubadp vs45, vs44, vs1
+ xvnmsubadp vs46, vs44, vs2
+ xvnmsubadp vs47, vs44, vs3
+
+//############### OFFSET 13 #######################
+
+ addi T1, T1, 13*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs45, vs45, vs0
+ xvnmsubadp vs46, vs45, vs1
+ xvnmsubadp vs47, vs45, vs2
+
+//############### OFFSET 14 #######################
+
+ addi T1, T1, 14*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs46, vs46, vs0
+ xvnmsubadp vs47, vs46, vs1
+
+//############### OFFSET 15 #######################
+
+ addi T1, T1, 15*SIZE
+
+ lxvdsx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs47, vs47, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+ add T2, CO, LDC
+
+
+ stxsdx vs32, o0, T1
+ xxswapd vs32, vs32
+ stxsdx vs33, o8, T1
+ xxswapd vs33, vs33
+ stxsdx vs34, o16, T1
+ xxswapd vs34, vs34
+ stxsdx vs35, o24, T1
+ xxswapd vs35, vs35
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ xxswapd vs36, vs36
+ stxsdx vs37, o8, T1
+ xxswapd vs37, vs37
+ stxsdx vs38, o16, T1
+ xxswapd vs38, vs38
+ stxsdx vs39, o24, T1
+ xxswapd vs39, vs39
+
+ addi T1, T1, 32
+
+ stxsdx vs40, o0, T1
+ xxswapd vs40, vs40
+ stxsdx vs41, o8, T1
+ xxswapd vs41, vs41
+ stxsdx vs42, o16, T1
+ xxswapd vs42, vs42
+ stxsdx vs43, o24, T1
+ xxswapd vs43, vs43
+
+ addi T1, T1, 32
+
+ stxsdx vs44, o0, T1
+ xxswapd vs44, vs44
+ stxsdx vs45, o8, T1
+ xxswapd vs45, vs45
+ stxsdx vs46, o16, T1
+ xxswapd vs46, vs46
+ stxsdx vs47, o24, T1
+ xxswapd vs47, vs47
+
+ stxsdx vs32, o0, T2
+ stxsdx vs33, o8, T2
+ stxsdx vs34, o16, T2
+ stxsdx vs35, o24, T2
+
+ addi T2, T2, 32
+
+ stxsdx vs36, o0, T2
+ stxsdx vs37, o8, T2
+ stxsdx vs38, o16, T2
+ stxsdx vs39, o24, T2
+
+ addi T2, T2, 32
+
+ stxsdx vs40, o0, T2
+ stxsdx vs41, o8, T2
+ stxsdx vs42, o16, T2
+ stxsdx vs43, o24, T2
+
+ addi T2, T2, 32
+
+ stxsdx vs44, o0, T2
+ stxsdx vs45, o8, T2
+ stxsdx vs46, o16, T2
+ stxsdx vs47, o24, T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 8x2
+##########################################################################################*/
+
+.macro SOLVE_LT_8x2
+
+ xxpermdi vs0, vs32, vs33, 0
+ xxpermdi vs1, vs32, vs33, 3
+
+ xxpermdi vs2, vs34, vs35, 0
+ xxpermdi vs3, vs34, vs35, 3
+
+ xxpermdi vs4, vs36, vs37, 0
+ xxpermdi vs5, vs36, vs37, 3
+
+ xxpermdi vs6, vs38, vs39, 0
+ xxpermdi vs7, vs38, vs39, 3
+
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxvd2x vs32, o0, T1
+ lxvd2x vs33, o16, T1
+ lxvd2x vs34, o32, T1
+ lxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ lxvd2x vs36, o0, T1
+ lxvd2x vs37, o16, T1
+ lxvd2x vs38, o32, T1
+ lxvd2x vs39, o48, T1
+
+ xvsubdp vs32, vs32, vs0
+ xvsubdp vs33, vs33, vs1
+ xvsubdp vs34, vs34, vs2
+ xvsubdp vs35, vs35, vs3
+ xvsubdp vs36, vs36, vs4
+ xvsubdp vs37, vs37, vs5
+ xvsubdp vs38, vs38, vs6
+ xvsubdp vs39, vs39, vs7
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+ lxvdsx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs32, vs32, vs0
+ xvnmsubadp vs33, vs32, vs1
+ xvnmsubadp vs34, vs32, vs2
+ xvnmsubadp vs35, vs32, vs3
+ xvnmsubadp vs36, vs32, vs4
+ xvnmsubadp vs37, vs32, vs5
+ xvnmsubadp vs38, vs32, vs6
+ xvnmsubadp vs39, vs32, vs7
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+ lxvdsx vs6, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs33, vs33, vs0
+ xvnmsubadp vs34, vs33, vs1
+ xvnmsubadp vs35, vs33, vs2
+ xvnmsubadp vs36, vs33, vs3
+ xvnmsubadp vs37, vs33, vs4
+ xvnmsubadp vs38, vs33, vs5
+ xvnmsubadp vs39, vs33, vs6
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+ lxvdsx vs5, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs34, vs34, vs0
+ xvnmsubadp vs35, vs34, vs1
+ xvnmsubadp vs36, vs34, vs2
+ xvnmsubadp vs37, vs34, vs3
+ xvnmsubadp vs38, vs34, vs4
+ xvnmsubadp vs39, vs34, vs5
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxvdsx vs4, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs35, vs35, vs0
+ xvnmsubadp vs36, vs35, vs1
+ xvnmsubadp vs37, vs35, vs2
+ xvnmsubadp vs38, vs35, vs3
+ xvnmsubadp vs39, vs35, vs4
+
+//############### OFFSET 4 #######################
+
+ addi T1, T1, 4*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs36, vs36, vs0
+ xvnmsubadp vs37, vs36, vs1
+ xvnmsubadp vs38, vs36, vs2
+ xvnmsubadp vs39, vs36, vs3
+
+//############### OFFSET 5 #######################
+
+ addi T1, T1, 5*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs37, vs37, vs0
+ xvnmsubadp vs38, vs37, vs1
+ xvnmsubadp vs39, vs37, vs2
+
+//############### OFFSET 6 #######################
+
+ addi T1, T1, 6*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs38, vs38, vs0
+ xvnmsubadp vs39, vs38, vs1
+
+//############### OFFSET 7 #######################
+
+ addi T1, T1, 7*SIZE
+
+ lxvdsx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs39, vs39, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+ add T2, CO, LDC
+
+
+ stxsdx vs32, o0, T1
+ xxswapd vs32, vs32
+ stxsdx vs33, o8, T1
+ xxswapd vs33, vs33
+ stxsdx vs34, o16, T1
+ xxswapd vs34, vs34
+ stxsdx vs35, o24, T1
+ xxswapd vs35, vs35
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ xxswapd vs36, vs36
+ stxsdx vs37, o8, T1
+ xxswapd vs37, vs37
+ stxsdx vs38, o16, T1
+ xxswapd vs38, vs38
+ stxsdx vs39, o24, T1
+ xxswapd vs39, vs39
+
+ stxsdx vs32, o0, T2
+ stxsdx vs33, o8, T2
+ stxsdx vs34, o16, T2
+ stxsdx vs35, o24, T2
+
+ addi T2, T2, 32
+
+ stxsdx vs36, o0, T2
+ stxsdx vs37, o8, T2
+ stxsdx vs38, o16, T2
+ stxsdx vs39, o24, T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 4x2
+##########################################################################################*/
+
+.macro SOLVE_LT_4x2
+
+ xxpermdi vs0, vs32, vs33, 0
+ xxpermdi vs1, vs32, vs33, 3
+
+ xxpermdi vs2, vs34, vs35, 0
+ xxpermdi vs3, vs34, vs35, 3
+
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxvd2x vs32, o0, T1
+ lxvd2x vs33, o16, T1
+ lxvd2x vs34, o32, T1
+ lxvd2x vs35, o48, T1
+
+ xvsubdp vs32, vs32, vs0
+ xvsubdp vs33, vs33, vs1
+ xvsubdp vs34, vs34, vs2
+ xvsubdp vs35, vs35, vs3
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+ lxvdsx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xvmuldp vs32, vs32, vs0
+ xvnmsubadp vs33, vs32, vs1
+ xvnmsubadp vs34, vs32, vs2
+ xvnmsubadp vs35, vs32, vs3
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+ lxvdsx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xvmuldp vs33, vs33, vs0
+ xvnmsubadp vs34, vs33, vs1
+ xvnmsubadp vs35, vs33, vs2
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs34, vs34, vs0
+ xvnmsubadp vs35, vs34, vs1
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxvdsx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs35, vs35, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+ add T2, CO, LDC
+
+
+ stxsdx vs32, o0, T1
+ xxswapd vs32, vs32
+ stxsdx vs33, o8, T1
+ xxswapd vs33, vs33
+ stxsdx vs34, o16, T1
+ xxswapd vs34, vs34
+ stxsdx vs35, o24, T1
+ xxswapd vs35, vs35
+
+ stxsdx vs32, o0, T2
+ stxsdx vs33, o8, T2
+ stxsdx vs34, o16, T2
+ stxsdx vs35, o24, T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 2x2
+##########################################################################################*/
+
+.macro SOLVE_LT_2x2
+
+ xxpermdi vs0, vs32, vs33, 0
+ xxpermdi vs1, vs32, vs33, 3
+
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxvd2x vs32, o0, T1
+ lxvd2x vs33, o16, T1
+
+ xvsubdp vs32, vs32, vs0
+ xvsubdp vs33, vs33, vs1
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxvdsx vs0, o0, T1
+ lxvdsx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xvmuldp vs32, vs32, vs0
+ xvnmsubadp vs33, vs32, vs1
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxvdsx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs33, vs33, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+ add T2, CO, LDC
+
+
+ stxsdx vs32, o0, T1
+ xxswapd vs32, vs32
+ stxsdx vs33, o8, T1
+ xxswapd vs33, vs33
+
+ stxsdx vs32, o0, T2
+ stxsdx vs33, o8, T2
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 1x2
+##########################################################################################*/
+
+.macro SOLVE_LT_1x2
+
+ xxpermdi vs0, vs32, vs33, 0
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxvd2x vs32, o0, T1
+
+ xvsubdp vs32, vs32, vs0
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxvdsx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xvmuldp vs32, vs32, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxvd2x vs32, o0, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+ add T2, CO, LDC
+
+
+ stxsdx vs32, o0, T1
+ xxswapd vs32, vs32
+
+ stxsdx vs32, o0, T2
+
+.endm
+
+
+.macro INIT_16x1
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+ xvmovdp vs40, vs0
+ xvmovdp vs41, vs0
+ xvmovdp vs42, vs0
+ xvmovdp vs43, vs0
+ xvmovdp vs44, vs0
+ xvmovdp vs45, vs0
+ xvmovdp vs46, vs0
+ xvmovdp vs47, vs0
+
+.endm
+
+
+.macro KERNEL_16x1
+
+
+ lxvdsx vs0, o0, AO
+ lxvdsx vs1, o8, AO
+ lxvdsx vs2, o16, AO
+ lxvdsx vs3, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs4, o0, AO
+ lxvdsx vs5, o8, AO
+ lxvdsx vs6, o16, AO
+ lxvdsx vs7, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs8, o0, AO
+ lxvdsx vs9, o8, AO
+ lxvdsx vs10, o16, AO
+ lxvdsx vs11, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs12, o0, AO
+ lxvdsx vs13, o8, AO
+ lxvdsx vs14, o16, AO
+ lxvdsx vs15, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO
+
+ addi BO, BO, 8
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs1, vs16
+ xvmaddadp vs34, vs2, vs16
+ xvmaddadp vs35, vs3, vs16
+ xvmaddadp vs36, vs4, vs16
+ xvmaddadp vs37, vs5, vs16
+ xvmaddadp vs38, vs6, vs16
+ xvmaddadp vs39, vs7, vs16
+ xvmaddadp vs40, vs8, vs16
+ xvmaddadp vs41, vs9, vs16
+ xvmaddadp vs42, vs10, vs16
+ xvmaddadp vs43, vs11, vs16
+ xvmaddadp vs44, vs12, vs16
+ xvmaddadp vs45, vs13, vs16
+ xvmaddadp vs46, vs14, vs16
+ xvmaddadp vs47, vs15, vs16
+
+
+.endm
+
+
+.macro INIT_8x1
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+ xvmovdp vs36, vs0
+ xvmovdp vs37, vs0
+ xvmovdp vs38, vs0
+ xvmovdp vs39, vs0
+
+.endm
+
+
+.macro KERNEL_8x1
+
+
+ lxvdsx vs0, o0, AO
+ lxvdsx vs1, o8, AO
+ lxvdsx vs2, o16, AO
+ lxvdsx vs3, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs4, o0, AO
+ lxvdsx vs5, o8, AO
+ lxvdsx vs6, o16, AO
+ lxvdsx vs7, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO
+
+ addi BO, BO, 8
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs1, vs16
+ xvmaddadp vs34, vs2, vs16
+ xvmaddadp vs35, vs3, vs16
+ xvmaddadp vs36, vs4, vs16
+ xvmaddadp vs37, vs5, vs16
+ xvmaddadp vs38, vs6, vs16
+ xvmaddadp vs39, vs7, vs16
+
+
+.endm
+
+
+.macro INIT_4x1
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+ xvmovdp vs34, vs0
+ xvmovdp vs35, vs0
+
+.endm
+
+
+.macro KERNEL_4x1
+
+
+ lxvdsx vs0, o0, AO
+ lxvdsx vs1, o8, AO
+ lxvdsx vs2, o16, AO
+ lxvdsx vs3, o24, AO
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO
+
+ addi BO, BO, 8
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs1, vs16
+ xvmaddadp vs34, vs2, vs16
+ xvmaddadp vs35, vs3, vs16
+
+
+.endm
+
+
+.macro INIT_2x1
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+ xvmovdp vs33, vs0
+
+.endm
+
+
+.macro KERNEL_2x1
+
+
+ lxvdsx vs0, o0, AO
+ lxvdsx vs1, o8, AO
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO
+
+ addi BO, BO, 8
+
+ xvmaddadp vs32, vs0, vs16
+ xvmaddadp vs33, vs1, vs16
+
+
+.endm
+
+
+.macro INIT_1x1
+
+
+ xxlxor vs0, vs0, vs0
+
+ xvmovdp vs32, vs0
+
+.endm
+
+
+.macro KERNEL_1x1
+
+
+ lxvdsx vs0, o0, AO
+
+ addi AO, AO, 8
+
+ lxvdsx vs16, o0, BO
+
+ addi BO, BO, 8
+
+ xvmaddadp vs32, vs0, vs16
+
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 16x1
+##########################################################################################*/
+
+.macro SOLVE_LT_16x1
+
+ xxswapd vs0, vs32
+ xxswapd vs1, vs33
+ xxswapd vs2, vs34
+ xxswapd vs3, vs35
+ xxswapd vs4, vs36
+ xxswapd vs5, vs37
+ xxswapd vs6, vs38
+ xxswapd vs7, vs39
+ xxswapd vs8, vs40
+ xxswapd vs9, vs41
+ xxswapd vs10, vs42
+ xxswapd vs11, vs43
+ xxswapd vs12, vs44
+ xxswapd vs13, vs45
+ xxswapd vs14, vs46
+ xxswapd vs15, vs47
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxsdx vs32, o0, T1
+ lxsdx vs33, o8, T1
+ lxsdx vs34, o16, T1
+ lxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs36, o0, T1
+ lxsdx vs37, o8, T1
+ lxsdx vs38, o16, T1
+ lxsdx vs39, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs40, o0, T1
+ lxsdx vs41, o8, T1
+ lxsdx vs42, o16, T1
+ lxsdx vs43, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs44, o0, T1
+ lxsdx vs45, o8, T1
+ lxsdx vs46, o16, T1
+ lxsdx vs47, o24, T1
+
+ xssubdp vs32, vs32, vs0
+ xssubdp vs33, vs33, vs1
+ xssubdp vs34, vs34, vs2
+ xssubdp vs35, vs35, vs3
+ xssubdp vs36, vs36, vs4
+ xssubdp vs37, vs37, vs5
+ xssubdp vs38, vs38, vs6
+ xssubdp vs39, vs39, vs7
+ xssubdp vs40, vs40, vs8
+ xssubdp vs41, vs41, vs9
+ xssubdp vs42, vs42, vs10
+ xssubdp vs43, vs43, vs11
+ xssubdp vs44, vs44, vs12
+ xssubdp vs45, vs45, vs13
+ xssubdp vs46, vs46, vs14
+ xssubdp vs47, vs47, vs15
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+ lxsdx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs12, o0, T1
+ lxsdx vs13, o8, T1
+ lxsdx vs14, o16, T1
+ lxsdx vs15, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs32, vs32, vs0
+ xsnmsubadp vs33, vs32, vs1
+ xsnmsubadp vs34, vs32, vs2
+ xsnmsubadp vs35, vs32, vs3
+ xsnmsubadp vs36, vs32, vs4
+ xsnmsubadp vs37, vs32, vs5
+ xsnmsubadp vs38, vs32, vs6
+ xsnmsubadp vs39, vs32, vs7
+ xsnmsubadp vs40, vs32, vs8
+ xsnmsubadp vs41, vs32, vs9
+ xsnmsubadp vs42, vs32, vs10
+ xsnmsubadp vs43, vs32, vs11
+ xsnmsubadp vs44, vs32, vs12
+ xsnmsubadp vs45, vs32, vs13
+ xsnmsubadp vs46, vs32, vs14
+ xsnmsubadp vs47, vs32, vs15
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+ lxsdx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs12, o0, T1
+ lxsdx vs13, o8, T1
+ lxsdx vs14, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs33, vs33, vs0
+ xsnmsubadp vs34, vs33, vs1
+ xsnmsubadp vs35, vs33, vs2
+ xsnmsubadp vs36, vs33, vs3
+ xsnmsubadp vs37, vs33, vs4
+ xsnmsubadp vs38, vs33, vs5
+ xsnmsubadp vs39, vs33, vs6
+ xsnmsubadp vs40, vs33, vs7
+ xsnmsubadp vs41, vs33, vs8
+ xsnmsubadp vs42, vs33, vs9
+ xsnmsubadp vs43, vs33, vs10
+ xsnmsubadp vs44, vs33, vs11
+ xsnmsubadp vs45, vs33, vs12
+ xsnmsubadp vs46, vs33, vs13
+ xsnmsubadp vs47, vs33, vs14
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+ lxsdx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs12, o0, T1
+ lxsdx vs13, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs34, vs34, vs0
+ xsnmsubadp vs35, vs34, vs1
+ xsnmsubadp vs36, vs34, vs2
+ xsnmsubadp vs37, vs34, vs3
+ xsnmsubadp vs38, vs34, vs4
+ xsnmsubadp vs39, vs34, vs5
+ xsnmsubadp vs40, vs34, vs6
+ xsnmsubadp vs41, vs34, vs7
+ xsnmsubadp vs42, vs34, vs8
+ xsnmsubadp vs43, vs34, vs9
+ xsnmsubadp vs44, vs34, vs10
+ xsnmsubadp vs45, vs34, vs11
+ xsnmsubadp vs46, vs34, vs12
+ xsnmsubadp vs47, vs34, vs13
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+ lxsdx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs12, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs35, vs35, vs0
+ xsnmsubadp vs36, vs35, vs1
+ xsnmsubadp vs37, vs35, vs2
+ xsnmsubadp vs38, vs35, vs3
+ xsnmsubadp vs39, vs35, vs4
+ xsnmsubadp vs40, vs35, vs5
+ xsnmsubadp vs41, vs35, vs6
+ xsnmsubadp vs42, vs35, vs7
+ xsnmsubadp vs43, vs35, vs8
+ xsnmsubadp vs44, vs35, vs9
+ xsnmsubadp vs45, vs35, vs10
+ xsnmsubadp vs46, vs35, vs11
+ xsnmsubadp vs47, vs35, vs12
+
+//############### OFFSET 4 #######################
+
+ addi T1, T1, 4*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+ lxsdx vs11, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs36, vs36, vs0
+ xsnmsubadp vs37, vs36, vs1
+ xsnmsubadp vs38, vs36, vs2
+ xsnmsubadp vs39, vs36, vs3
+ xsnmsubadp vs40, vs36, vs4
+ xsnmsubadp vs41, vs36, vs5
+ xsnmsubadp vs42, vs36, vs6
+ xsnmsubadp vs43, vs36, vs7
+ xsnmsubadp vs44, vs36, vs8
+ xsnmsubadp vs45, vs36, vs9
+ xsnmsubadp vs46, vs36, vs10
+ xsnmsubadp vs47, vs36, vs11
+
+//############### OFFSET 5 #######################
+
+ addi T1, T1, 5*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+ lxsdx vs10, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs37, vs37, vs0
+ xsnmsubadp vs38, vs37, vs1
+ xsnmsubadp vs39, vs37, vs2
+ xsnmsubadp vs40, vs37, vs3
+ xsnmsubadp vs41, vs37, vs4
+ xsnmsubadp vs42, vs37, vs5
+ xsnmsubadp vs43, vs37, vs6
+ xsnmsubadp vs44, vs37, vs7
+ xsnmsubadp vs45, vs37, vs8
+ xsnmsubadp vs46, vs37, vs9
+ xsnmsubadp vs47, vs37, vs10
+
+//############### OFFSET 6 #######################
+
+ addi T1, T1, 6*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+ lxsdx vs9, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs38, vs38, vs0
+ xsnmsubadp vs39, vs38, vs1
+ xsnmsubadp vs40, vs38, vs2
+ xsnmsubadp vs41, vs38, vs3
+ xsnmsubadp vs42, vs38, vs4
+ xsnmsubadp vs43, vs38, vs5
+ xsnmsubadp vs44, vs38, vs6
+ xsnmsubadp vs45, vs38, vs7
+ xsnmsubadp vs46, vs38, vs8
+ xsnmsubadp vs47, vs38, vs9
+
+//############### OFFSET 7 #######################
+
+ addi T1, T1, 7*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs8, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs39, vs39, vs0
+ xsnmsubadp vs40, vs39, vs1
+ xsnmsubadp vs41, vs39, vs2
+ xsnmsubadp vs42, vs39, vs3
+ xsnmsubadp vs43, vs39, vs4
+ xsnmsubadp vs44, vs39, vs5
+ xsnmsubadp vs45, vs39, vs6
+ xsnmsubadp vs46, vs39, vs7
+ xsnmsubadp vs47, vs39, vs8
+
+//############### OFFSET 8 #######################
+
+ addi T1, T1, 8*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs40, vs40, vs0
+ xsnmsubadp vs41, vs40, vs1
+ xsnmsubadp vs42, vs40, vs2
+ xsnmsubadp vs43, vs40, vs3
+ xsnmsubadp vs44, vs40, vs4
+ xsnmsubadp vs45, vs40, vs5
+ xsnmsubadp vs46, vs40, vs6
+ xsnmsubadp vs47, vs40, vs7
+
+//############### OFFSET 9 #######################
+
+ addi T1, T1, 9*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs41, vs41, vs0
+ xsnmsubadp vs42, vs41, vs1
+ xsnmsubadp vs43, vs41, vs2
+ xsnmsubadp vs44, vs41, vs3
+ xsnmsubadp vs45, vs41, vs4
+ xsnmsubadp vs46, vs41, vs5
+ xsnmsubadp vs47, vs41, vs6
+
+//############### OFFSET 10 #######################
+
+ addi T1, T1, 10*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs42, vs42, vs0
+ xsnmsubadp vs43, vs42, vs1
+ xsnmsubadp vs44, vs42, vs2
+ xsnmsubadp vs45, vs42, vs3
+ xsnmsubadp vs46, vs42, vs4
+ xsnmsubadp vs47, vs42, vs5
+
+//############### OFFSET 11 #######################
+
+ addi T1, T1, 11*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs43, vs43, vs0
+ xsnmsubadp vs44, vs43, vs1
+ xsnmsubadp vs45, vs43, vs2
+ xsnmsubadp vs46, vs43, vs3
+ xsnmsubadp vs47, vs43, vs4
+
+//############### OFFSET 12 #######################
+
+ addi T1, T1, 12*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs44, vs44, vs0
+ xsnmsubadp vs45, vs44, vs1
+ xsnmsubadp vs46, vs44, vs2
+ xsnmsubadp vs47, vs44, vs3
+
+//############### OFFSET 13 #######################
+
+ addi T1, T1, 13*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs45, vs45, vs0
+ xsnmsubadp vs46, vs45, vs1
+ xsnmsubadp vs47, vs45, vs2
+
+//############### OFFSET 14 #######################
+
+ addi T1, T1, 14*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs46, vs46, vs0
+ xsnmsubadp vs47, vs46, vs1
+
+//############### OFFSET 15 #######################
+
+ addi T1, T1, 15*SIZE
+
+ lxsdx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs47, vs47, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ stxsdx vs37, o8, T1
+ stxsdx vs38, o16, T1
+ stxsdx vs39, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs40, o0, T1
+ stxsdx vs41, o8, T1
+ stxsdx vs42, o16, T1
+ stxsdx vs43, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs44, o0, T1
+ stxsdx vs45, o8, T1
+ stxsdx vs46, o16, T1
+ stxsdx vs47, o24, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ stxsdx vs37, o8, T1
+ stxsdx vs38, o16, T1
+ stxsdx vs39, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs40, o0, T1
+ stxsdx vs41, o8, T1
+ stxsdx vs42, o16, T1
+ stxsdx vs43, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs44, o0, T1
+ stxsdx vs45, o8, T1
+ stxsdx vs46, o16, T1
+ stxsdx vs47, o24, T1
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 8x1
+##########################################################################################*/
+
+.macro SOLVE_LT_8x1
+
+ xxswapd vs0, vs32
+ xxswapd vs1, vs33
+ xxswapd vs2, vs34
+ xxswapd vs3, vs35
+ xxswapd vs4, vs36
+ xxswapd vs5, vs37
+ xxswapd vs6, vs38
+ xxswapd vs7, vs39
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxsdx vs32, o0, T1
+ lxsdx vs33, o8, T1
+ lxsdx vs34, o16, T1
+ lxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs36, o0, T1
+ lxsdx vs37, o8, T1
+ lxsdx vs38, o16, T1
+ lxsdx vs39, o24, T1
+
+ xssubdp vs32, vs32, vs0
+ xssubdp vs33, vs33, vs1
+ xssubdp vs34, vs34, vs2
+ xssubdp vs35, vs35, vs3
+ xssubdp vs36, vs36, vs4
+ xssubdp vs37, vs37, vs5
+ xssubdp vs38, vs38, vs6
+ xssubdp vs39, vs39, vs7
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+ lxsdx vs7, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs32, vs32, vs0
+ xsnmsubadp vs33, vs32, vs1
+ xsnmsubadp vs34, vs32, vs2
+ xsnmsubadp vs35, vs32, vs3
+ xsnmsubadp vs36, vs32, vs4
+ xsnmsubadp vs37, vs32, vs5
+ xsnmsubadp vs38, vs32, vs6
+ xsnmsubadp vs39, vs32, vs7
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+ lxsdx vs6, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs33, vs33, vs0
+ xsnmsubadp vs34, vs33, vs1
+ xsnmsubadp vs35, vs33, vs2
+ xsnmsubadp vs36, vs33, vs3
+ xsnmsubadp vs37, vs33, vs4
+ xsnmsubadp vs38, vs33, vs5
+ xsnmsubadp vs39, vs33, vs6
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+ lxsdx vs5, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs34, vs34, vs0
+ xsnmsubadp vs35, vs34, vs1
+ xsnmsubadp vs36, vs34, vs2
+ xsnmsubadp vs37, vs34, vs3
+ xsnmsubadp vs38, vs34, vs4
+ xsnmsubadp vs39, vs34, vs5
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ lxsdx vs4, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs35, vs35, vs0
+ xsnmsubadp vs36, vs35, vs1
+ xsnmsubadp vs37, vs35, vs2
+ xsnmsubadp vs38, vs35, vs3
+ xsnmsubadp vs39, vs35, vs4
+
+//############### OFFSET 4 #######################
+
+ addi T1, T1, 4*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs36, vs36, vs0
+ xsnmsubadp vs37, vs36, vs1
+ xsnmsubadp vs38, vs36, vs2
+ xsnmsubadp vs39, vs36, vs3
+
+//############### OFFSET 5 #######################
+
+ addi T1, T1, 5*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs37, vs37, vs0
+ xsnmsubadp vs38, vs37, vs1
+ xsnmsubadp vs39, vs37, vs2
+
+//############### OFFSET 6 #######################
+
+ addi T1, T1, 6*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs38, vs38, vs0
+ xsnmsubadp vs39, vs38, vs1
+
+//############### OFFSET 7 #######################
+
+ addi T1, T1, 7*SIZE
+
+ lxsdx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs39, vs39, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ stxsdx vs37, o8, T1
+ stxsdx vs38, o16, T1
+ stxsdx vs39, o24, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+ addi T1, T1, 32
+
+ stxsdx vs36, o0, T1
+ stxsdx vs37, o8, T1
+ stxsdx vs38, o16, T1
+ stxsdx vs39, o24, T1
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 4x1
+##########################################################################################*/
+
+.macro SOLVE_LT_4x1
+
+ xxswapd vs0, vs32
+ xxswapd vs1, vs33
+ xxswapd vs2, vs34
+ xxswapd vs3, vs35
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxsdx vs32, o0, T1
+ lxsdx vs33, o8, T1
+ lxsdx vs34, o16, T1
+ lxsdx vs35, o24, T1
+
+ xssubdp vs32, vs32, vs0
+ xssubdp vs33, vs33, vs1
+ xssubdp vs34, vs34, vs2
+ xssubdp vs35, vs35, vs3
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+ lxsdx vs3, o24, T1
+
+ addi T1, T1, 32
+
+ xsmuldp vs32, vs32, vs0
+ xsnmsubadp vs33, vs32, vs1
+ xsnmsubadp vs34, vs32, vs2
+ xsnmsubadp vs35, vs32, vs3
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+ lxsdx vs2, o16, T1
+
+ addi T1, T1, 24
+
+ xsmuldp vs33, vs33, vs0
+ xsnmsubadp vs34, vs33, vs1
+ xsnmsubadp vs35, vs33, vs2
+
+//############### OFFSET 2 #######################
+
+ addi T1, T1, 2*SIZE
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs34, vs34, vs0
+ xsnmsubadp vs35, vs34, vs1
+
+//############### OFFSET 3 #######################
+
+ addi T1, T1, 3*SIZE
+
+ lxsdx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs35, vs35, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+ stxsdx vs34, o16, T1
+ stxsdx vs35, o24, T1
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 2x1
+##########################################################################################*/
+
+.macro SOLVE_LT_2x1
+
+ xxswapd vs0, vs32
+ xxswapd vs1, vs33
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxsdx vs32, o0, T1
+ lxsdx vs33, o8, T1
+
+ xssubdp vs32, vs32, vs0
+ xssubdp vs33, vs33, vs1
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxsdx vs0, o0, T1
+ lxsdx vs1, o8, T1
+
+ addi T1, T1, 16
+
+ xsmuldp vs32, vs32, vs0
+ xsnmsubadp vs33, vs32, vs1
+
+//############### OFFSET 1 #######################
+
+ addi T1, T1, 1*SIZE
+
+ lxsdx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs33, vs33, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+
+ stxsdx vs32, o0, T1
+ stxsdx vs33, o8, T1
+
+.endm
+
+
+/*##########################################################################################
+ SOLVE_LT 1x1
+##########################################################################################*/
+
+.macro SOLVE_LT_1x1
+
+ xxswapd vs0, vs32
+
+//############### LOAD B #######################
+
+
+ mr T1, BO
+
+ lxsdx vs32, o0, T1
+
+ xssubdp vs32, vs32, vs0
+
+ mr T1, AO
+
+
+//############### OFFSET 0 #######################
+
+ lxsdx vs0, o0, T1
+
+ addi T1, T1, 8
+
+ xsmuldp vs32, vs32, vs0
+
+//############### SAVE B #######################
+
+
+ mr T1, BO
+
+
+ stxsdx vs32, o0, T1
+
+//############### SAVE C #######################
+
+
+ mr T1, CO
+
+ stxsdx vs32, o0, T1
+
+.endm
+
diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S
index 77f3f7cfb..e169eb970 100644
--- a/kernel/power/sgemm_kernel_16x8_power8.S
+++ b/kernel/power/sgemm_kernel_16x8_power8.S
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
-* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S
index 06bb79ea3..8907fe6ad 100644
--- a/kernel/power/sgemm_logic_16x8_power8.S
+++ b/kernel/power/sgemm_logic_16x8_power8.S
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
-* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
-* LAPACK-TEST : OK
+* LAPACK-TEST : OK
**************************************************************************************/
srawi. J, N, 3
@@ -40,35 +40,48 @@ SGEMM_L8_BEGIN:
mr BO, B
mr BBO, BBUFFER
- slwi T1, K, 3
+ srawi. T1, K, 2
+ ble SGEMM_L8_COPYB1
+
-SGEMM_L8_COPYB:
+SGEMM_L8_COPYB4:
+
+ dcbt BO, PRE
dcbtst BBO, PRE
+ COPYB_4x8
+ addic. T1, T1, -1
+ ble SGEMM_L8_COPYB1
- lxvw4x vs3, o0, BO
- lxvw4x vs11, o16, BO
- xxspltw vs4, vs3, 0
- xxspltw vs5, vs3, 1
- xxspltw vs6, vs3, 2
- xxspltw vs7, vs3, 3
- xxspltw vs12, vs11, 0
- xxspltw vs13, vs11, 1
- xxspltw vs14, vs11, 2
- xxspltw vs15, vs11, 3
- stxvw4x vs4, o0, BBO
- stxvw4x vs5, o16, BBO
- stxvw4x vs6, o32, BBO
- stxvw4x vs7, o48, BBO
- addi BO, BO, 32
- addi BBO, BBO, 64
- stxvw4x vs12, o0, BBO
- stxvw4x vs13, o16, BBO
- stxvw4x vs14, o32, BBO
- stxvw4x vs15, o48, BBO
- addic. T1, T1, -8
- addi BBO, BBO, 64
+ dcbtst BBO, PRE
+ COPYB_4x8
+ addic. T1, T1, -1
+ ble SGEMM_L8_COPYB1
- bge SGEMM_L8_COPYB
+ dcbtst BBO, PRE
+ COPYB_4x8
+ addic. T1, T1, -1
+ ble SGEMM_L8_COPYB1
+
+ dcbtst BBO, PRE
+ COPYB_4x8
+ addic. T1, T1, -1
+
+ bgt SGEMM_L8_COPYB4
+
+SGEMM_L8_COPYB1:
+
+ andi. T1, K, 3
+ ble SGEMM_L8_COPYB_END
+
+SGEMM_L8_COPYB1_LOOP:
+
+
+ COPYB_1x8
+ addic. T1, T1, -1
+
+ bgt SGEMM_L8_COPYB1_LOOP
+
+SGEMM_L8_COPYB_END:
mr CO, C
mr AO, A
@@ -93,24 +106,24 @@ SGEMM_L8x16_LOOP_START:
LOAD8x16_1
dcbt BO, PRE
KERNEL8x16_I1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
addic. L, L, -2
@@ -122,24 +135,24 @@ SGEMM_L8x16_LOOP:
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
addic. L, L, -1
@@ -149,18 +162,15 @@ SGEMM_L8x16_LOOP_END:
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
+ dcbt BO, PRE
KERNEL8x16_2
dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
KERNEL8x16_2
- dcbt BO, PRE
KERNEL8x16_1
- dcbt BO, PRE
dcbt AO, PRE
KERNEL8x16_2
KERNEL8x16_1
diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S
index 71dc52979..98414857f 100644
--- a/kernel/power/sgemm_macros_16x8_power8.S
+++ b/kernel/power/sgemm_macros_16x8_power8.S
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
-* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
-* LAPACK-TEST : OK
+* LAPACK-TEST : OK
**************************************************************************************/
@@ -5886,3 +5886,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
+
+
+
+
+.macro COPYB_4x8
+
+
+ lxvw4x vs5, o0, BO
+ xxspltw vs6, vs5, 0
+ xxspltw vs7, vs5, 1
+ xxspltw vs8, vs5, 2
+ xxspltw vs9, vs5, 3
+
+ lxvw4x vs10, o16, BO
+ xxspltw vs11, vs10, 0
+ xxspltw vs12, vs10, 1
+ xxspltw vs13, vs10, 2
+ xxspltw vs14, vs10, 3
+
+ lxvw4x vs15, o32, BO
+ xxspltw vs16, vs15, 0
+ xxspltw vs17, vs15, 1
+ xxspltw vs18, vs15, 2
+ xxspltw vs19, vs15, 3
+
+ lxvw4x vs20, o48, BO
+ xxspltw vs21, vs20, 0
+ xxspltw vs22, vs20, 1
+ xxspltw vs23, vs20, 2
+ xxspltw vs24, vs20, 3
+
+ addi BO, BO, 64
+ lxvw4x vs35, o0, BO
+ xxspltw vs36, vs35, 0
+ xxspltw vs37, vs35, 1
+ xxspltw vs38, vs35, 2
+ xxspltw vs39, vs35, 3
+
+ lxvw4x vs40, o16, BO
+ xxspltw vs41, vs40, 0
+ xxspltw vs42, vs40, 1
+ xxspltw vs43, vs40, 2
+ xxspltw vs44, vs40, 3
+
+ lxvw4x vs45, o32, BO
+ xxspltw vs46, vs45, 0
+ xxspltw vs47, vs45, 1
+ xxspltw vs48, vs45, 2
+ xxspltw vs49, vs45, 3
+
+ lxvw4x vs50, o48, BO
+ xxspltw vs51, vs50, 0
+ xxspltw vs52, vs50, 1
+ xxspltw vs53, vs50, 2
+ xxspltw vs54, vs50, 3
+
+ addi BO, BO, 64
+
+
+ stxvw4x vs6, o0, BBO
+ stxvw4x vs7, o16, BBO
+ stxvw4x vs8, o32, BBO
+ stxvw4x vs9, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs11, o0, BBO
+ stxvw4x vs12, o16, BBO
+ stxvw4x vs13, o32, BBO
+ stxvw4x vs14, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs16, o0, BBO
+ stxvw4x vs17, o16, BBO
+ stxvw4x vs18, o32, BBO
+ stxvw4x vs19, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs21, o0, BBO
+ stxvw4x vs22, o16, BBO
+ stxvw4x vs23, o32, BBO
+ stxvw4x vs24, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs36, o0, BBO
+ stxvw4x vs37, o16, BBO
+ stxvw4x vs38, o32, BBO
+ stxvw4x vs39, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs41, o0, BBO
+ stxvw4x vs42, o16, BBO
+ stxvw4x vs43, o32, BBO
+ stxvw4x vs44, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs46, o0, BBO
+ stxvw4x vs47, o16, BBO
+ stxvw4x vs48, o32, BBO
+ stxvw4x vs49, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs51, o0, BBO
+ stxvw4x vs52, o16, BBO
+ stxvw4x vs53, o32, BBO
+ stxvw4x vs54, o48, BBO
+
+ addi BBO, BBO, 64
+.endm
+
+
+.macro COPYB_1x8
+
+
+ lxvw4x vs5, o0, BO
+ xxspltw vs6, vs5, 0
+ xxspltw vs7, vs5, 1
+ xxspltw vs8, vs5, 2
+ xxspltw vs9, vs5, 3
+
+ lxvw4x vs10, o16, BO
+ xxspltw vs11, vs10, 0
+ xxspltw vs12, vs10, 1
+ xxspltw vs13, vs10, 2
+ xxspltw vs14, vs10, 3
+
+
+ addi BO, BO, 32
+
+ stxvw4x vs6, o0, BBO
+ stxvw4x vs7, o16, BBO
+ stxvw4x vs8, o32, BBO
+ stxvw4x vs9, o48, BBO
+
+ addi BBO, BBO, 64
+ stxvw4x vs11, o0, BBO
+ stxvw4x vs12, o16, BBO
+ stxvw4x vs13, o32, BBO
+ stxvw4x vs14, o48, BBO
+
+ addi BBO, BBO, 64
+.endm
+
diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S
new file mode 100644
index 000000000..764d5b187
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_16_power8.S
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o4 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define B16 r29
+#define M16 r30
+#define T1 r31
+
+#define o0 0
+
+#include "sgemm_tcopy_macros_16_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+ slwi M16, M, 4 + BASE_SHIFT
+
+ li T1, -16
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B8, N, T1
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B8, B8, M
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B8, B8, BASE_SHIFT
+ slwi B4, B4, BASE_SHIFT
+ slwi B2, B2, BASE_SHIFT
+ slwi B1, B1, BASE_SHIFT
+
+ add B8, B8, B
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 768
+ addi PREB, M16, 128
+
+ li o4, 4
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "sgemm_tcopy_logic_16_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..2bbd6e696
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_8_power8.S
@@ -0,0 +1,207 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define o4 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTU1 r29
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "sgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, BASE_SHIFT
+ slwi M8, M, 3 + BASE_SHIFT
+
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, BASE_SHIFT
+ slwi B2, B2, BASE_SHIFT
+ slwi B1, B1, BASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o4, 4
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "sgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/sgemm_tcopy_logic_16_power8.S b/kernel/power/sgemm_tcopy_logic_16_power8.S
new file mode 100644
index 000000000..7dfb6fa46
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_logic_16_power8.S
@@ -0,0 +1,324 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+// Transposing copy logic for the 16-wide sgemm packing kernel.
+// Rows of A are processed four at a time; full 16-column chunks go into
+// the 16-wide panel (cursor B16), and the N&8 / N&4 / N&2 / N&1 column
+// tails go into the B8 / B4 / B2 / B1 panels. B8/B4/B2/B1, M16, PREA and
+// PREB are set up by the including kernel (M16 presumably = 16*M*SIZE,
+// the stride between successive 16-column chunks — confirm in prologue).
+	srawi.	I, M, 2
+	ble	SCOPYT_L2_BEGIN
+
+
+SCOPYT_L4_BEGIN:
+
+// A0..A3 = four consecutive source rows; reserve 4*16 elements in the
+// 16-wide panel and advance B past them.
+	mr	A0, A
+	add	A1, A0, LDA
+	add	A2, A1, LDA
+	add	A3, A2, LDA
+	add	A,  A3, LDA
+	mr	B16, B
+	addi	B, B, 64*SIZE
+
+	sradi.	J, N, 4
+	ble	SCOPYT_L4x8_BEGIN
+
+	mr	BO, B16
+
+// Two-way unrolled loop: dcbtst prefetches the destination panel,
+// dcbt the four source rows (second copy of the body omits the source
+// prefetches — the first iteration's touches already cover it).
+SCOPYT_L4x16_LOOP:
+
+	dcbtst	BO, M16
+	dcbtst	BO, PREB
+	dcbt	A0, PREA
+	dcbt	A1, PREA
+	dcbt	A2, PREA
+	dcbt	A3, PREA
+	COPY_4x16
+
+	addi	A0, A0, 16*SIZE
+	addi	A1, A1, 16*SIZE
+	addi	A2, A2, 16*SIZE
+	addi	A3, A3, 16*SIZE
+	add	BO, BO, M16
+
+	addic.	J, J, -1
+	ble	SCOPYT_L4x8_BEGIN
+
+
+	dcbtst	BO, M16
+	dcbtst	BO, PREB
+	COPY_4x16
+
+	addi	A0, A0, 16*SIZE
+	addi	A1, A1, 16*SIZE
+	addi	A2, A2, 16*SIZE
+	addi	A3, A3, 16*SIZE
+	add	BO, BO, M16
+
+	addic.	J, J, -1
+	bgt	SCOPYT_L4x16_LOOP
+
+// N & 8: copy an 8-column tail of the four rows into the B8 panel.
+SCOPYT_L4x8_BEGIN:
+
+	andi.	T1, N, 8
+	ble	SCOPYT_L4x4_BEGIN
+
+	mr	BO, B8
+
+	COPY_4x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	addi	A2, A2, 8*SIZE
+	addi	A3, A3, 8*SIZE
+
+	addi	B8, B8, 32*SIZE
+
+// N & 4: 4-column tail into the B4 panel.
+SCOPYT_L4x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYT_L4x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_4x4
+
+	addi	A0, A0, 4*SIZE
+	addi	A1, A1, 4*SIZE
+	addi	A2, A2, 4*SIZE
+	addi	A3, A3, 4*SIZE
+
+	addi	B4, B4, 16*SIZE
+
+// N & 2: 2-column tail into the B2 panel.
+SCOPYT_L4x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYT_L4x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_4x2
+
+	addi	A0, A0, 2*SIZE
+	addi	A1, A1, 2*SIZE
+	addi	A2, A2, 2*SIZE
+	addi	A3, A3, 2*SIZE
+
+	addi	B2, B2, 8*SIZE
+
+// N & 1: last column into the B1 panel.
+SCOPYT_L4x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYT_L4_END
+
+	mr	BO, B1
+
+	COPY_4x1
+
+	addi	A0, A0, 1*SIZE
+	addi	A1, A1, 1*SIZE
+	addi	A2, A2, 1*SIZE
+	addi	A3, A3, 1*SIZE
+
+	addi	B1, B1, 4*SIZE
+
+SCOPYT_L4_END:
+
+	addic.	I, I, -1
+	bgt	SCOPYT_L4_BEGIN
+
+
+
+// M & 2: same column decomposition for two remaining rows.
+SCOPYT_L2_BEGIN:
+
+	andi.	T1, M, 2
+	ble	SCOPYT_L1_BEGIN
+
+	mr	A0, A
+	add	A1, A0, LDA
+	add	A,  A1, LDA
+	mr	B16, B
+	addi	B, B, 32*SIZE
+
+	sradi.	J, N, 4
+	ble	SCOPYT_L2x8_BEGIN
+
+	mr	BO, B16
+
+SCOPYT_L2x16_LOOP:
+
+	COPY_2x16
+
+	addi	A0, A0, 16*SIZE
+	addi	A1, A1, 16*SIZE
+	add	BO, BO, M16
+
+	addic.	J, J, -1
+	bgt	SCOPYT_L2x16_LOOP
+
+SCOPYT_L2x8_BEGIN:
+
+	andi.	T1, N, 8
+	ble	SCOPYT_L2x4_BEGIN
+
+	mr	BO, B8
+
+	COPY_2x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+
+	addi	B8, B8, 16*SIZE
+
+SCOPYT_L2x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYT_L2x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_2x4
+
+	addi	A0, A0, 4*SIZE
+	addi	A1, A1, 4*SIZE
+
+	addi	B4, B4, 8*SIZE
+
+SCOPYT_L2x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYT_L2x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_2x2
+
+	addi	A0, A0, 2*SIZE
+	addi	A1, A1, 2*SIZE
+
+	addi	B2, B2, 4*SIZE
+
+SCOPYT_L2x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYT_L2_END
+
+	mr	BO, B1
+
+	COPY_2x1
+
+	addi	A0, A0, 1*SIZE
+	addi	A1, A1, 1*SIZE
+
+	addi	B1, B1, 2*SIZE
+
+SCOPYT_L2_END:
+
+
+// M & 1: last remaining row.
+SCOPYT_L1_BEGIN:
+
+	andi.	T1, M, 1
+	ble	L999
+
+	mr	A0, A
+	add	A,  A0, LDA
+	mr	B16, B
+	addi	B, B, 16*SIZE
+
+	sradi.	J, N, 4
+	ble	SCOPYT_L1x8_BEGIN
+
+	mr	BO, B16
+
+SCOPYT_L1x16_LOOP:
+
+	COPY_1x16
+
+	addi	A0, A0, 16*SIZE
+	add	BO, BO, M16
+
+	addic.	J, J, -1
+	bgt	SCOPYT_L1x16_LOOP
+
+SCOPYT_L1x8_BEGIN:
+
+	andi.	T1, N, 8
+	ble	SCOPYT_L1x4_BEGIN
+
+	mr	BO, B8
+
+	COPY_1x8
+
+	addi	A0, A0, 8*SIZE
+
+	addi	B8, B8, 8*SIZE
+
+SCOPYT_L1x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYT_L1x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_1x4
+
+	addi	A0, A0, 4*SIZE
+
+	addi	B4, B4, 4*SIZE
+
+SCOPYT_L1x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYT_L1x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_1x2
+
+	addi	A0, A0, 2*SIZE
+
+	addi	B2, B2, 2*SIZE
+
+SCOPYT_L1x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYT_L1_END
+
+	mr	BO, B1
+
+	COPY_1x1
+
+	addi	A0, A0, 1*SIZE
+
+	addi	B1, B1, 1*SIZE
+
+SCOPYT_L1_END:
+
diff --git a/kernel/power/sgemm_tcopy_logic_8_power8.S b/kernel/power/sgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..4cf74baa3
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,299 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+// Transposing copy logic for the 8-wide sgemm packing kernel.
+// Rows of A are processed four at a time; full 8-column chunks go into
+// the 8-wide panel (cursor B8), and the N&4 / N&2 / N&1 column tails go
+// into the B4 / B2 / B1 panels set up by the prologue. M8 = 8*M*SIZE is
+// the stride between successive 8-column chunks in the panel.
+	srawi.	I, M, 2
+	ble	SCOPYOT_L2_BEGIN
+
+
+SCOPYOT_L4_BEGIN:
+
+// A0..A3 = four consecutive source rows; reserve 4*8 elements in the
+// 8-wide panel and advance B past them.
+	mr	A0, A
+	add	A1, A0, LDA
+	add	A2, A1, LDA
+	add	A3, A2, LDA
+	add	A,  A3, LDA
+	mr	B8, B
+	addi	B, B, 32*SIZE
+
+	sradi.	J, N, 3
+	ble	SCOPYOT_L4x4_BEGIN
+
+	mr	BO, B8
+	.align 5
+
+// Four-way unrolled loop; only the first copy touches the source rows
+// with dcbt (prefetch distance PREA set in the prologue).
+SCOPYOT_L4x8_LOOP:
+
+	dcbt	A0, PREA
+	dcbt	A1, PREA
+	dcbt	A2, PREA
+	dcbt	A3, PREA
+	COPY_4x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	addi	A2, A2, 8*SIZE
+	addi	A3, A3, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	ble	SCOPYOT_L4x4_BEGIN
+
+	COPY_4x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	addi	A2, A2, 8*SIZE
+	addi	A3, A3, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	ble	SCOPYOT_L4x4_BEGIN
+
+	COPY_4x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	addi	A2, A2, 8*SIZE
+	addi	A3, A3, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	ble	SCOPYOT_L4x4_BEGIN
+
+	COPY_4x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	addi	A2, A2, 8*SIZE
+	addi	A3, A3, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	bgt	SCOPYOT_L4x8_LOOP
+
+// N & 4: 4-column tail of the four rows into the B4 panel.
+SCOPYOT_L4x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYOT_L4x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_4x4
+
+	addi	A0, A0, 4*SIZE
+	addi	A1, A1, 4*SIZE
+	addi	A2, A2, 4*SIZE
+	addi	A3, A3, 4*SIZE
+
+	addi	B4, B4, 16*SIZE
+
+// N & 2: 2-column tail into the B2 panel.
+SCOPYOT_L4x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYOT_L4x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_4x2
+
+	addi	A0, A0, 2*SIZE
+	addi	A1, A1, 2*SIZE
+	addi	A2, A2, 2*SIZE
+	addi	A3, A3, 2*SIZE
+
+	addi	B2, B2, 8*SIZE
+
+// N & 1: last column into the B1 panel.
+SCOPYOT_L4x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYOT_L4_END
+
+	mr	BO, B1
+
+	COPY_4x1
+
+	addi	A0, A0, 1*SIZE
+	addi	A1, A1, 1*SIZE
+	addi	A2, A2, 1*SIZE
+	addi	A3, A3, 1*SIZE
+
+	addi	B1, B1, 4*SIZE
+
+SCOPYOT_L4_END:
+
+	addic.	I, I, -1
+	bgt	SCOPYOT_L4_BEGIN
+
+
+
+// M & 2: same column decomposition for two remaining rows.
+SCOPYOT_L2_BEGIN:
+
+	andi.	T1, M, 2
+	ble	SCOPYOT_L1_BEGIN
+
+	mr	A0, A
+	add	A1, A0, LDA
+	add	A,  A1, LDA
+	mr	B8, B
+	addi	B, B, 16*SIZE
+
+	sradi.	J, N, 3
+	ble	SCOPYOT_L2x4_BEGIN
+
+	mr	BO, B8
+
+SCOPYOT_L2x8_LOOP:
+
+	COPY_2x8
+
+	addi	A0, A0, 8*SIZE
+	addi	A1, A1, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	bgt	SCOPYOT_L2x8_LOOP
+
+SCOPYOT_L2x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYOT_L2x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_2x4
+
+	addi	A0, A0, 4*SIZE
+	addi	A1, A1, 4*SIZE
+
+	addi	B4, B4, 8*SIZE
+
+SCOPYOT_L2x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYOT_L2x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_2x2
+
+	addi	A0, A0, 2*SIZE
+	addi	A1, A1, 2*SIZE
+
+	addi	B2, B2, 4*SIZE
+
+SCOPYOT_L2x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYOT_L2_END
+
+	mr	BO, B1
+
+	COPY_2x1
+
+	addi	A0, A0, 1*SIZE
+	addi	A1, A1, 1*SIZE
+
+	addi	B1, B1, 2*SIZE
+
+SCOPYOT_L2_END:
+
+
+// M & 1: last remaining row.
+SCOPYOT_L1_BEGIN:
+
+	andi.	T1, M, 1
+	ble	L999
+
+	mr	A0, A
+	add	A,  A0, LDA
+	mr	B8, B
+	addi	B, B, 8*SIZE
+
+	sradi.	J, N, 3
+	ble	SCOPYOT_L1x4_BEGIN
+
+	mr	BO, B8
+
+SCOPYOT_L1x8_LOOP:
+
+	COPY_1x8
+
+	addi	A0, A0, 8*SIZE
+	add	BO, BO, M8
+
+	addic.	J, J, -1
+	bgt	SCOPYOT_L1x8_LOOP
+
+SCOPYOT_L1x4_BEGIN:
+
+	andi.	T1, N, 4
+	ble	SCOPYOT_L1x2_BEGIN
+
+	mr	BO, B4
+
+	COPY_1x4
+
+	addi	A0, A0, 4*SIZE
+
+	addi	B4, B4, 4*SIZE
+
+SCOPYOT_L1x2_BEGIN:
+
+	andi.	T1, N, 2
+	ble	SCOPYOT_L1x1_BEGIN
+
+	mr	BO, B2
+
+	COPY_1x2
+
+	addi	A0, A0, 2*SIZE
+
+	addi	B2, B2, 2*SIZE
+
+SCOPYOT_L1x1_BEGIN:
+
+	andi.	T1, N, 1
+	ble	SCOPYOT_L1_END
+
+	mr	BO, B1
+
+	COPY_1x1
+
+	addi	A0, A0, 1*SIZE
+
+	addi	B1, B1, 1*SIZE
+
+SCOPYOT_L1_END:
+
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
new file mode 100644
index 000000000..53f9c8b82
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -0,0 +1,416 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+.macro COPY_4x16
+// Copy a 4x16 tile: 16 floats from each of rows A0..A3 (lxvw4x = four
+// 32-bit floats per load), stored as four consecutive 16-float groups
+// at BO. T1 is a scratch store cursor; BO itself is not advanced.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+	lxvw4x		vs34, o32,	A0
+	lxvw4x		vs35, o48,	A0
+
+	lxvw4x		vs36, o0,	A1
+	lxvw4x		vs37, o16,	A1
+	lxvw4x		vs38, o32,	A1
+	lxvw4x		vs39, o48,	A1
+
+	lxvw4x		vs40, o0,	A2
+	lxvw4x		vs41, o16,	A2
+	lxvw4x		vs42, o32,	A2
+	lxvw4x		vs43, o48,	A2
+
+	lxvw4x		vs44, o0,	A3
+	lxvw4x		vs45, o16,	A3
+	lxvw4x		vs46, o32,	A3
+	lxvw4x		vs47, o48,	A3
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs36, o0,	T1
+	stxvw4x		vs37, o16,	T1
+	stxvw4x		vs38, o32,	T1
+	stxvw4x		vs39, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs40, o0,	T1
+	stxvw4x		vs41, o16,	T1
+	stxvw4x		vs42, o32,	T1
+	stxvw4x		vs43, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs44, o0,	T1
+	stxvw4x		vs45, o16,	T1
+	stxvw4x		vs46, o32,	T1
+	stxvw4x		vs47, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+// Copy a 4x8 tile: 8 floats from each of rows A0..A3, stored as four
+// consecutive 8-float groups at BO (T1 = scratch store cursor).
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	lxvw4x		vs34, o0,	A1
+	lxvw4x		vs35, o16,	A1
+
+	lxvw4x		vs36, o0,	A2
+	lxvw4x		vs37, o16,	A2
+
+	lxvw4x		vs38, o0,	A3
+	lxvw4x		vs39, o16,	A3
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs36, o0,	T1
+	stxvw4x		vs37, o16,	T1
+
+	stxvw4x		vs38, o32,	T1
+	stxvw4x		vs39, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+// Copy a 4x4 tile: one 4-float vector per row, stored contiguously at BO.
+
+	lxvw4x		vs32, o0,	A0
+
+	lxvw4x		vs33, o0,	A1
+
+	lxvw4x		vs34, o0,	A2
+
+	lxvw4x		vs35, o0,	A3
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+
+	stxvw4x		vs35, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+// Copy a 4x2 tile with scalar single-precision loads/stores
+// (lxsspx/stxsspx move one 32-bit float each): 2 floats per row,
+// stored as four consecutive pairs at BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	lxsspx		vs34, o0,	A1
+	lxsspx		vs35, o4,	A1
+
+	lxsspx		vs36, o0,	A2
+	lxsspx		vs37, o4,	A2
+
+	lxsspx		vs38, o0,	A3
+	lxsspx		vs39, o4,	A3
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+	stxsspx		vs35, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs36, o0,	T1
+	stxsspx		vs37, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs38, o0,	T1
+	stxsspx		vs39, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+// Copy a 4x1 tile: one float from each row, stored as 4 consecutive
+// floats at BO (two pairs, 8 bytes apart).
+
+	lxsspx		vs32, o0,	A0
+
+	lxsspx		vs33, o0,	A1
+
+	lxsspx		vs34, o0,	A2
+
+	lxsspx		vs35, o0,	A3
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+
+	stxsspx		vs35, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+.macro COPY_2x16
+// Copy a 2x16 tile: 16 floats from rows A0 and A1, stored as two
+// consecutive 16-float groups at BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+	lxvw4x		vs34, o32,	A0
+	lxvw4x		vs35, o48,	A0
+
+	lxvw4x		vs36, o0,	A1
+	lxvw4x		vs37, o16,	A1
+	lxvw4x		vs38, o32,	A1
+	lxvw4x		vs39, o48,	A1
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs36, o0,	T1
+	stxvw4x		vs37, o16,	T1
+	stxvw4x		vs38, o32,	T1
+	stxvw4x		vs39, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+// Copy a 2x8 tile: 8 floats from rows A0 and A1, stored as two
+// consecutive 8-float groups at BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	lxvw4x		vs34, o0,	A1
+	lxvw4x		vs35, o16,	A1
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+// Copy a 2x4 tile: one 4-float vector from each of A0 and A1.
+
+	lxvw4x		vs32, o0,	A0
+
+	lxvw4x		vs33, o0,	A1
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+	stxvw4x		vs33, o16,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+// Copy a 2x2 tile with scalar loads/stores: two floats per row,
+// stored as two consecutive pairs at BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	lxsspx		vs34, o0,	A1
+	lxsspx		vs35, o4,	A1
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+	stxsspx		vs35, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+// Copy a 2x1 tile: one float from each of A0 and A1, stored adjacently.
+
+	lxsspx		vs32, o0,	A0
+
+	lxsspx		vs33, o0,	A1
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+	stxsspx		vs33, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro COPY_1x16
+// Copy a 1x16 tile: 16 floats from row A0 to BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+	lxvw4x		vs34, o32,	A0
+	lxvw4x		vs35, o48,	A0
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+// Copy a 1x8 tile: 8 floats from row A0 to BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+// Copy a 1x4 tile: one 4-float vector from A0 to BO.
+
+	lxvw4x		vs32, o0,	A0
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+// Copy a 1x2 tile: two scalar floats from A0 to BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+// Copy a single float from A0 to BO.
+
+	lxsspx		vs32, o0,	A0
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+.endm
+
diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..1b71d5bb3
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,308 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+// Copy a 4x8 tile: 8 floats from each of rows A0..A3 (lxvw4x = four
+// 32-bit floats per load), stored as four consecutive 8-float groups
+// at BO. T1 is a scratch store cursor; BO itself is not advanced.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	lxvw4x		vs34, o0,	A1
+	lxvw4x		vs35, o16,	A1
+
+	lxvw4x		vs36, o0,	A2
+	lxvw4x		vs37, o16,	A2
+
+	lxvw4x		vs38, o0,	A3
+	lxvw4x		vs39, o16,	A3
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+	addi		T1,	T1,	64
+
+	stxvw4x		vs36, o0,	T1
+	stxvw4x		vs37, o16,	T1
+
+	stxvw4x		vs38, o32,	T1
+	stxvw4x		vs39, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+// Copy a 4x4 tile: one 4-float vector per row, stored contiguously at BO.
+
+	lxvw4x		vs32, o0,	A0
+
+	lxvw4x		vs33, o0,	A1
+
+	lxvw4x		vs34, o0,	A2
+
+	lxvw4x		vs35, o0,	A3
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+
+	stxvw4x		vs35, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+// Copy a 4x2 tile with scalar single-precision loads/stores
+// (lxsspx/stxsspx move one 32-bit float each): 2 floats per row,
+// stored as four consecutive pairs at BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	lxsspx		vs34, o0,	A1
+	lxsspx		vs35, o4,	A1
+
+	lxsspx		vs36, o0,	A2
+	lxsspx		vs37, o4,	A2
+
+	lxsspx		vs38, o0,	A3
+	lxsspx		vs39, o4,	A3
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+	stxsspx		vs35, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs36, o0,	T1
+	stxsspx		vs37, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs38, o0,	T1
+	stxsspx		vs39, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+// Copy a 4x1 tile: one float from each row, stored as 4 consecutive
+// floats at BO (two pairs, 8 bytes apart).
+
+	lxsspx		vs32, o0,	A0
+
+	lxsspx		vs33, o0,	A1
+
+	lxsspx		vs34, o0,	A2
+
+	lxsspx		vs35, o0,	A3
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+
+	stxsspx		vs35, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+// Copy a 2x8 tile: 8 floats from rows A0 and A1, stored as two
+// consecutive 8-float groups at BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	lxvw4x		vs34, o0,	A1
+	lxvw4x		vs35, o16,	A1
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+	stxvw4x		vs34, o32,	T1
+	stxvw4x		vs35, o48,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+// Copy a 2x4 tile: one 4-float vector from each of A0 and A1.
+
+	lxvw4x		vs32, o0,	A0
+
+	lxvw4x		vs33, o0,	A1
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+	stxvw4x		vs33, o16,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+// Copy a 2x2 tile with scalar loads/stores: two floats per row,
+// stored as two consecutive pairs at BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	lxsspx		vs34, o0,	A1
+	lxsspx		vs35, o4,	A1
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+	addi		T1,	T1,	8
+
+	stxsspx		vs34, o0,	T1
+	stxsspx		vs35, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+// Copy a 2x1 tile: one float from each of A0 and A1, stored adjacently.
+
+	lxsspx		vs32, o0,	A0
+
+	lxsspx		vs33, o0,	A1
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+	stxsspx		vs33, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+// Copy a 1x8 tile: 8 floats from row A0 to BO.
+
+	lxvw4x		vs32, o0,	A0
+	lxvw4x		vs33, o16,	A0
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+	stxvw4x		vs33, o16,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+// Copy a 1x4 tile: one 4-float vector from A0 to BO.
+
+	lxvw4x		vs32, o0,	A0
+
+	mr		T1,	BO
+
+	stxvw4x		vs32, o0,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+// Copy a 1x2 tile: two scalar floats from A0 to BO.
+
+	lxsspx		vs32, o0,	A0
+	lxsspx		vs33, o4,	A0
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+	stxsspx		vs33, o4,	T1
+
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+// Copy a single float from A0 to BO.
+
+	lxsspx		vs32, o0,	A0
+
+	mr		T1,	BO
+
+	stxsspx		vs32, o0,	T1
+
+.endm
+
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
index 336b13b1f..02c94a88a 100644
--- a/kernel/power/zgemm_kernel_8x2_power8.S
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -1,3 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@@ -250,7 +320,7 @@
ble L999
slwi LDC, LDC, ZBASE_SHIFT
- li PRE, 384
+ li PRE, 512
li o8 , 8
li o16 , 16
li o24 , 24
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
index 96612da82..0cd784cc0 100644
--- a/kernel/power/zgemm_logic_8x2_power8.S
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -1,3 +1,39 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
srawi. J, N, 1
ble ZGEMM_L2_END
@@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
- slwi T1, K, 1
+ srawi. T1, K, 2
+ ble ZGEMM_L2_COPYB1
-ZGEMM_L2_COPYB:
+ZGEMM_L2_COPYB8:
- lxvdsx vs4, o0, BO // b0_r
- lxvdsx vs5, o8, BO // b0_i
- addi BO, BO, 16
- stxvd2x vs4, o0, BBO
- stxvd2x vs5, o16, BBO
+ addi T2, PRE, 128
+ dcbt BO, PRE
+ dcbtst BBO, PRE
+ dcbtst BBO, T2
+ ZCOPYB_8x1
addic. T1, T1, -1
- addi BBO, BBO, 32
- bge ZGEMM_L2_COPYB
+ bgt ZGEMM_L2_COPYB8
+
+ZGEMM_L2_COPYB1:
+
+ andi. T1, K, 3
+ ble ZGEMM_L2_COPYB_END
+
+ZGEMM_L2_COPYB_LOOP:
+
+ ZCOPYB_1x1
+ ZCOPYB_1x1
+ addic. T1, T1, -1
+
+ bgt ZGEMM_L2_COPYB_LOOP
+ZGEMM_L2_COPYB_END:
mr CO, C
mr AO, A
@@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
slwi T1, K, 0
ZGEMM_L1_COPYB:
+ dcbtst BBO, PRE
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index a0fbb2e11..c43a115b2 100644
--- a/kernel/power/zgemm_macros_8x2_power8.S
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -1,3 +1,38 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
@@ -3055,3 +3090,76 @@
.endm
+
+
+.macro ZCOPYB_1x1
+
+ lxvdsx vs4, o0, BO // b0_r
+ lxvdsx vs5, o8, BO // b0_i
+ addi BO, BO, 16
+ stxvd2x vs4, o0, BBO
+ stxvd2x vs5, o16, BBO
+ addi BBO, BBO, 32
+
+.endm
+
+
+.macro ZCOPYB_8x1
+
+ lxvd2x vs32, o0, BO
+ lxvd2x vs33, o16, BO
+ lxvd2x vs34, o32, BO
+ lxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+ lxvd2x vs36, o0, BO
+ lxvd2x vs37, o16, BO
+ lxvd2x vs38, o32, BO
+ lxvd2x vs39, o48, BO
+ addi BO, BO, 64
+
+ xxspltd vs40, vs32, 0
+ xxspltd vs41, vs32, 1
+ xxspltd vs42, vs33, 0
+ xxspltd vs43, vs33, 1
+ xxspltd vs44, vs34, 0
+ xxspltd vs45, vs34, 1
+ xxspltd vs46, vs35, 0
+ xxspltd vs47, vs35, 1
+
+ xxspltd vs48, vs36, 0
+ xxspltd vs49, vs36, 1
+ xxspltd vs50, vs37, 0
+ xxspltd vs51, vs37, 1
+ xxspltd vs52, vs38, 0
+ xxspltd vs53, vs38, 1
+ xxspltd vs54, vs39, 0
+ xxspltd vs55, vs39, 1
+
+ stxvd2x vs40, o0, BBO
+ stxvd2x vs41, o16, BBO
+ stxvd2x vs42, o32, BBO
+ stxvd2x vs43, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs44, o0, BBO
+ stxvd2x vs45, o16, BBO
+ stxvd2x vs46, o32, BBO
+ stxvd2x vs47, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs48, o0, BBO
+ stxvd2x vs49, o16, BBO
+ stxvd2x vs50, o32, BBO
+ stxvd2x vs51, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs52, o0, BBO
+ stxvd2x vs53, o16, BBO
+ stxvd2x vs54, o32, BBO
+ stxvd2x vs55, o48, BBO
+ addi BBO, BBO, 64
+
+.endm
+
+
diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..1f3f35419
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_8_power8.S
@@ -0,0 +1,205 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3
+#define N r4
+#define A r5
+#define LDA r6
+#define B r7
+
+#define A0 r8
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12
+
+#define PREA r14
+#define PREB r15
+#define BO r16
+#define B8 r17
+#define B4 r18
+#define B2 r19
+#define B1 r20
+#define NOTUS1 r21
+#define T2 r22
+#define I r23
+#define o16 r24
+#define o32 r25
+#define o48 r26
+#define NOTUS2 r27
+#define M8 r30
+#define T1 r31
+
+#define o0 0
+
+#include "zgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0
+ ble- L999
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, ZBASE_SHIFT
+ slwi M8, M, 3 + ZBASE_SHIFT
+
+ li T2, -8
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2
+ and B2, N, PREA
+ and B1, N, PREB
+
+ mullw B4, B4, M
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, ZBASE_SHIFT
+ slwi B2, B2, ZBASE_SHIFT
+ slwi B1, B1, ZBASE_SHIFT
+
+ add B4, B4, B
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384
+ addi PREB, M8, 128
+
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "zgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..34fd307bd
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,246 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2
+ ble ZCOPYT_L2_BEGIN
+
+
+ZCOPYT_L4_BEGIN:
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA
+ mr B8, B
+ addi B, B, 64*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L4x4_BEGIN
+
+ mr BO, B8
+
+ .align 5
+
+ZCOPYT_L4x8_LOOP:
+
+ addi T1, PREB, 128
+ addi T2, PREB, 256
+ dcbt A0, PREA
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ dcbtst BO, M8
+ dcbtst BO, PREB
+ dcbtst BO, T1
+ dcbtst BO, T2
+
+ COPY_4x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L4x8_LOOP
+
+ZCOPYT_L4x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L4x2_BEGIN
+
+ mr BO, B4
+
+ COPY_4x4
+
+
+ addi B4, B4, 32*SIZE
+
+ZCOPYT_L4x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+
+ addi B2, B2, 16*SIZE
+
+ZCOPYT_L4x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+
+ addi B1, B1, 8*SIZE
+
+ZCOPYT_L4_END:
+
+ addic. I, I, -1
+ bgt ZCOPYT_L4_BEGIN
+
+
+
+ZCOPYT_L2_BEGIN:
+
+ andi. T1, M, 2
+ ble ZCOPYT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 32*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L2x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L2x8_LOOP:
+
+ COPY_2x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L2x8_LOOP
+
+ZCOPYT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+
+ addi B4, B4, 16*SIZE
+
+ZCOPYT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+
+ addi B2, B2, 8*SIZE
+
+ZCOPYT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+
+ addi B1, B1, 4*SIZE
+
+ZCOPYT_L2_END:
+
+
+ZCOPYT_L1_BEGIN:
+
+ andi. T1, M, 1
+ ble L999
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L1x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L1x8_LOOP:
+
+ COPY_1x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L1x8_LOOP
+
+ZCOPYT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+
+ addi B4, B4, 8*SIZE
+
+ZCOPYT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+
+ addi B2, B2, 4*SIZE
+
+ZCOPYT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+
+ addi B1, B1, 2*SIZE
+
+ZCOPYT_L1_END:
+
diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..e8c2f0baa
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,535 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs48, o0, A2
+ lxvd2x vs49, o16, A2
+ lxvd2x vs50, o32, A2
+ lxvd2x vs51, o48, A2
+ addi A2, A2, 64
+
+ lxvd2x vs52, o0, A2
+ lxvd2x vs53, o16, A2
+ lxvd2x vs54, o32, A2
+ lxvd2x vs55, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs56, o0, A3
+ lxvd2x vs57, o16, A3
+ lxvd2x vs58, o32, A3
+ lxvd2x vs59, o48, A3
+ addi A3, A3, 64
+
+ lxvd2x vs60, o0, A3
+ lxvd2x vs61, o16, A3
+ lxvd2x vs62, o32, A3
+ lxvd2x vs63, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs48, o0, T1
+ stxvd2x vs49, o16, T1
+ stxvd2x vs50, o32, T1
+ stxvd2x vs51, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs52, o0, T1
+ stxvd2x vs53, o16, T1
+ stxvd2x vs54, o32, T1
+ stxvd2x vs55, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs56, o0, T1
+ stxvd2x vs57, o16, T1
+ stxvd2x vs58, o32, T1
+ stxvd2x vs59, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs60, o0, T1
+ stxvd2x vs61, o16, T1
+ stxvd2x vs62, o32, T1
+ stxvd2x vs63, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs40, o0, A2
+ lxvd2x vs41, o16, A2
+ lxvd2x vs42, o32, A2
+ lxvd2x vs43, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs44, o0, A3
+ lxvd2x vs45, o16, A3
+ lxvd2x vs46, o32, A3
+ lxvd2x vs47, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ lxvd2x vs36, o0, A2
+ lxvd2x vs37, o16, A2
+ addi A2, A2, 32
+
+
+ lxvd2x vs38, o0, A3
+ lxvd2x vs39, o16, A3
+ addi A3, A3, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ lxvd2x vs34, o0, A2
+ addi A2, A2, 16
+
+
+ lxvd2x vs35, o0, A3
+ addi A3, A3, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ mr T1, BO
+
+ stxvd2x vs32, o0, T1
+
+.endm
+
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 213839a8f..410fc9840 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1"
#if defined(POWER8)
+#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
+#endif
#ifndef HAVE_KERNEL_8
@@ -123,6 +125,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
if ( inc_x <= 0 )
return(0);
+ if (da_r == ZERO && da_i == ZERO) {
+ //clear the vector and return
+ if (inc_x == 1) {
+ memset(x, 0, n*COMPSIZE*SIZE);
+ }else{
+ inc_x2 = 2 * inc_x;
+ for(i=0; i a
END PROGRAM
")
- try_compile( SIZEOF_${_TYPE_NAME} ${CMAKE_BINARY_DIR} ${__TEST_FILE} )
+ try_compile( SIZEOF_${_TYPE_NAME} ${PROJECT_BINARY_DIR} ${__TEST_FILE} )
if( SIZEOF_${_TYPE_NAME} )
message( STATUS "Testing default ${_TYPE_NAME}*${__TEST_SIZE} - found" )
set( SIZEOF_${_TYPE_NAME} ${__TEST_SIZE} CACHE INTERNAL "Size of the default ${_TYPE_NAME} type" FORCE )
diff --git a/lapack-netlib/CMAKE/CheckTimeFunction.cmake b/lapack-netlib/CMAKE/CheckTimeFunction.cmake
index 350a59132..1a65f242b 100644
--- a/lapack-netlib/CMAKE/CheckTimeFunction.cmake
+++ b/lapack-netlib/CMAKE/CheckTimeFunction.cmake
@@ -16,11 +16,11 @@ macro(CHECK_TIME_FUNCTION FUNCTION VARIABLE)
if(RES)
set(${VARIABLE} ${FUNCTION} CACHE INTERNAL "Have Fortran function ${FUNCTION}")
message(STATUS "Looking for Fortran ${FUNCTION} - found")
- file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+ file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Fortran ${FUNCTION} exists. ${OUTPUT} \n\n")
else(RES)
message(STATUS "Looking for Fortran ${FUNCTION} - not found")
- file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+ file(APPEND ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
"Fortran ${FUNCTION} does not exist. \n ${OUTPUT} \n")
endif(RES)
endmacro(CHECK_TIME_FUNCTION)
diff --git a/lapack-netlib/CMAKE/FortranMangling.cmake b/lapack-netlib/CMAKE/FortranMangling.cmake
index 98b8443ef..538c80218 100644
--- a/lapack-netlib/CMAKE/FortranMangling.cmake
+++ b/lapack-netlib/CMAKE/FortranMangling.cmake
@@ -43,7 +43,7 @@ MESSAGE(STATUS "Testing FORTRAN_MANGLING")
MESSAGE(STATUS "Compiling Finface.f...")
execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Fintface.f
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
OUTPUT_VARIABLE OUTPUT
RESULT_VARIABLE RESULT
ERROR_VARIABLE ERROR)
@@ -58,7 +58,7 @@ MESSAGE(STATUS "Compiling Finface.f...")
MESSAGE(STATUS "Compiling Cintface.c...")
execute_process ( COMMAND ${CMAKE_C_COMPILER} ${F77_OPTION_COMPILE} ${PROJECT_SOURCE_DIR}/lapacke/mangling/Cintface.c
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
OUTPUT_VARIABLE OUTPUT
RESULT_VARIABLE RESULT
ERROR_VARIABLE ERROR)
@@ -73,7 +73,7 @@ MESSAGE(STATUS "Compiling Cintface.c...")
MESSAGE(STATUS "Linking Finface.f and Cintface.c...")
execute_process ( COMMAND ${CMAKE_Fortran_COMPILER} ${F77_OUTPUT_OBJ} xintface.exe Fintface.o Cintface.o
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
OUTPUT_VARIABLE OUTPUT
RESULT_VARIABLE RESULT
ERROR_VARIABLE ERROR)
@@ -88,7 +88,7 @@ MESSAGE(STATUS "Linking Finface.f and Cintface.c...")
MESSAGE(STATUS "Running ./xintface...")
execute_process ( COMMAND ./xintface.exe
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp
RESULT_VARIABLE xintface_RES
OUTPUT_VARIABLE xintface_OUT
ERROR_VARIABLE xintface_ERR)
diff --git a/lapack-netlib/SRC/zgetrf2.f b/lapack-netlib/SRC/zgetrf2.f
index 290d4847e..7d28b5812 100644
--- a/lapack-netlib/SRC/zgetrf2.f
+++ b/lapack-netlib/SRC/zgetrf2.f
@@ -144,7 +144,7 @@
EXTERNAL DLAMCH, IZAMAX
* ..
* .. External Subroutines ..
- EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, ZERBLA
+ EXTERNAL ZGEMM, ZSCAL, ZLASWP, ZTRSM, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index de42e1ab6..afd583c11 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -1,5 +1,5 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
set(LAPACK_SOURCES
diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c
index 7e2319718..6b8cbda2f 100644
--- a/lapack/getrf/getrf_parallel_omp.c
+++ b/lapack/getrf/getrf_parallel_omp.c
@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
- if (blocking <= GEMM_UNROLL_N * 2) {
+#ifdef POWER8
+ if (blocking <= GEMM_UNROLL_N) {
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
+#else
+ if (blocking <= GEMM_UNROLL_N*2) {
+ info = GETF2(args, NULL, range_n, sa, sb, 0);
+ return info;
+ }
+#endif
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c
index e60a16c11..9f0f36b78 100644
--- a/lapack/getrf/getrf_single.c
+++ b/lapack/getrf/getrf_single.c
@@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q;
+#ifdef POWER8
+ if (blocking <= GEMM_UNROLL_N) {
+ info = GETF2(args, NULL, range_n, sa, sb, 0);
+ return info;
+ }
+#else
if (blocking <= GEMM_UNROLL_N * 2) {
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
+#endif
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
diff --git a/lapack/laswp/mips/Makefile b/lapack/laswp/mips/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/mips/Makefile
@@ -0,0 +1,13 @@
+TOPDIR = ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP = ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP = ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index a6ead4b64..480518cd4 100644
--- a/param.h
+++ b/param.h
@@ -1964,9 +1964,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SNUMOPT 16
#define DNUMOPT 8
-#define GEMM_DEFAULT_OFFSET_A 4096
-#define GEMM_DEFAULT_OFFSET_B 4096
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
@@ -1977,20 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 960
-#define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 480
+#define SGEMM_DEFAULT_P 1280
+#define DGEMM_DEFAULT_P 640
+#define CGEMM_DEFAULT_P 640
+#define ZGEMM_DEFAULT_P 320
-#define SGEMM_DEFAULT_Q 720
+#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 720
-
-#define SGEMM_DEFAULT_R 21600
-#define DGEMM_DEFAULT_R 14400
-#define CGEMM_DEFAULT_R 16200
-#define ZGEMM_DEFAULT_R 21600
+#define CGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 640
#define SYMV_P 8
@@ -2179,6 +2174,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
+#if defined(P5600) || defined(I6400) || defined(P6600)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#ifdef HAVE_MSA
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#else
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#endif
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+#endif
#ifdef ARMV7
#define SNUMOPT 2
@@ -2269,13 +2315,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cd4497117..5e9baf928 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
enable_language(Fortran)
diff --git a/test/Makefile b/test/Makefile
index 75ea6de60..65fb6f438 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,7 @@ include ../Makefile.system
all :: level1 level2 level3
level1 : sblat1 dblat1 cblat1 zblat1
+ifndef CROSS
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@@ -21,8 +22,10 @@ else
OPENBLAS_NUM_THREADS=2 ./zblat1
endif
endif
+endif
level2 : sblat2 dblat2 cblat2 zblat2
+ifndef CROSS
rm -f ?BLAT2.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@@ -54,8 +57,10 @@ else
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
endif
endif
+endif
level3 : sblat3 dblat3 cblat3 zblat3
+ifndef CROSS
rm -f ?BLAT3.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@@ -87,9 +92,11 @@ else
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
endif
endif
+endif
level3_3m : zblat3_3m cblat3_3m
+ifndef CROSS
rm -f ?BLAT3_3M.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -109,6 +116,7 @@ else
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
endif
endif
+endif
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index dfa42df67..f0ffee088 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${CMAKE_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
set(OpenBLAS_utest_src
utest_main.c
diff --git a/utest/Makefile b/utest/Makefile
index 9f9808920..3ccc0a041 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
run_test: $(UTESTBIN)
+ifndef CROSS
./$(UTESTBIN)
+endif
clean:
-rm -f *.o $(UTESTBIN)
diff --git a/utest/ctest.h b/utest/ctest.h
index a62103ff5..1deea32f6 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -637,7 +637,7 @@ static void *find_symbol(struct ctest *test, const char *fname)
static void sighandler(int signum)
{
char msg[128];
- sprintf(msg, "[SIGNAL %d: %s]", signum, sys_siglist[signum]);
+ snprintf(msg, sizeof(msg), "[SIGNAL %d: %s]", signum, strsignal(signum));
color_print(ANSI_BRED, msg);
fflush(stdout);