Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>tags/v0.2.19^2
| @@ -151,5 +151,9 @@ In chronological order: | |||
| * [2016-03-20] Fix compiler error in VisualStudio with CMake | |||
| * [2016-03-22] Fix access violation on Windows while static linking | |||
| * Paul Mustière <https://github.com/buffer51/> | |||
| * [2016-02-04] Fix Android build on ARMV7 | |||
| * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 | |||
| * Shivraj Patil <https://github.com/sva-img/> | |||
| * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA | |||
| @@ -108,8 +108,6 @@ endif | |||
| tests : | |||
| ifndef NOFORTRAN | |||
| ifndef TARGET | |||
| ifndef CROSS | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| @@ -119,8 +117,6 @@ ifndef NO_CBLAS | |||
| $(MAKE) -C ctest all | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| libs : | |||
| ifeq ($(CORE), UNKOWN) | |||
| @@ -20,75 +20,75 @@ lib.grd : | |||
| $(error OpenBLAS: Please run "make" firstly) | |||
| install : lib.grd | |||
| @-mkdir -p $(DESTDIR)$(PREFIX) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @-mkdir -p "$(DESTDIR)$(PREFIX)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | |||
| @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| #for inc | |||
| @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | |||
| @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" | |||
| ifndef NO_CBLAS | |||
| @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h | |||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||
| @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| @cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) | |||
| @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ | |||
| @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| @-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
| @-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) | |||
| @@ -96,34 +96,34 @@ endif | |||
| endif | |||
| #Generating OpenBLASConfig.cmake | |||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| ifndef NO_SHARED | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), Darwin) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| else | |||
| #only static | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| #Generating OpenBLASConfigVersion.cmake | |||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) | |||
| @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo Install OK! | |||
| @@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt | |||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X. | |||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | |||
| - **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| ## Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| @@ -1,5 +1,7 @@ | |||
| #!/usr/bin/perl | |||
| use File::Basename; | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| @@ -26,14 +28,12 @@ if ($?) { | |||
| $cross_suffix = ""; | |||
| if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { | |||
| if ($1 =~ /(.*-)(.*)/) { | |||
| $cross_suffix = $1; | |||
| } | |||
| } else { | |||
| if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { | |||
| $cross_suffix = $1; | |||
| } | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /(.*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| $compiler = ""; | |||
| @@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64; | |||
| print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; | |||
| print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; | |||
| print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; | |||
| print MAKEFILE "CROSS=1\n" if $cross != 0; | |||
| print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; | |||
| @@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o | |||
| all :: all1 all2 all3 | |||
| all1: xscblat1 xdcblat1 xccblat1 xzcblat1 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat1 | |||
| OMP_NUM_THREADS=2 ./xdcblat1 | |||
| @@ -53,8 +54,10 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat1 | |||
| endif | |||
| endif | |||
| all2: xscblat2 xdcblat2 xccblat2 xzcblat2 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat2 < sin2 | |||
| OMP_NUM_THREADS=2 ./xdcblat2 < din2 | |||
| @@ -66,8 +69,10 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 | |||
| endif | |||
| endif | |||
| all3: xscblat3 xdcblat3 xccblat3 xzcblat3 | |||
| ifndef CROSS | |||
| ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./xscblat3 < sin3 | |||
| OMP_NUM_THREADS=2 ./xdcblat3 < din3 | |||
| @@ -88,6 +93,7 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m | |||
| OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m | |||
| endif | |||
| endif | |||
| @@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| char message[128]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 21; i++) | |||
| for ( i=1 ; i <= 22; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -361,6 +361,9 @@ static void numa_mapping(void) { | |||
| unsigned long work, bit; | |||
| int count = 0; | |||
| int bitmask_idx = 0; | |||
| int current_cpu; | |||
| int current_node = 0; | |||
| int cpu_count = 0; | |||
| for (node = 0; node < common -> num_nodes; node ++) { | |||
| core = 0; | |||
| @@ -382,33 +385,84 @@ static void numa_mapping(void) { | |||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | |||
| #endif | |||
| h = 1; | |||
| while (h < count) h = 2 * h + 1; | |||
| while (h > 1) { | |||
| h /= 2; | |||
| for (i = h; i < count; i++) { | |||
| work = common -> cpu_info[i]; | |||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||
| j = i - h; | |||
| while (work < common -> cpu_info[j]) { | |||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| j -= h; | |||
| if (j < 0) break; | |||
| } | |||
| common -> cpu_info[j + h] = work; | |||
| if (bit) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| current_cpu = sched_getcpu(); | |||
| for (cpu = 0; cpu < count; cpu++) { | |||
| if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { | |||
| current_node = READ_NODE(common -> cpu_info[cpu]); | |||
| break; | |||
| } | |||
| } | |||
| for (i = 0; i < MAX_BITMASK_LEN; i++) | |||
| cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); | |||
| /* | |||
| * If all the processes can be accommodated in the | |||
| * current node itself, then bind to cores | |||
| * from the current node only | |||
| */ | |||
| if (numprocs <= cpu_count) { | |||
| /* | |||
| * First sort all the cores in order from the current node. | |||
| * Then take remaining nodes one by one in order, | |||
| * and sort their cores in order. | |||
| */ | |||
| for (i = 0; i < count; i++) { | |||
| for (j = 0; j < count - 1; j++) { | |||
| int node_1, node_2; | |||
| int core_1, core_2; | |||
| int swap = 0; | |||
| node_1 = READ_NODE(common -> cpu_info[j]); | |||
| node_2 = READ_NODE(common -> cpu_info[j + 1]); | |||
| core_1 = READ_CORE(common -> cpu_info[j]); | |||
| core_2 = READ_CORE(common -> cpu_info[j + 1]); | |||
| if (node_1 == node_2) { | |||
| if (core_1 > core_2) | |||
| swap = 1; | |||
| } else { | |||
| if ((node_2 == current_node) || | |||
| ((node_1 != current_node) && (node_1 > node_2))) | |||
| swap = 1; | |||
| } | |||
| if (swap) { | |||
| unsigned long temp; | |||
| temp = common->cpu_info[j]; | |||
| common->cpu_info[j] = common->cpu_info[j + 1]; | |||
| common->cpu_info[j + 1] = temp; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| h = 1; | |||
| while (h < count) h = 2 * h + 1; | |||
| while (h > 1) { | |||
| h /= 2; | |||
| for (i = h; i < count; i++) { | |||
| work = common -> cpu_info[i]; | |||
| bit = CPU_ISSET(i, &cpu_orig_mask[0]); | |||
| j = i - h; | |||
| while (work < common -> cpu_info[j]) { | |||
| common -> cpu_info[j + h] = common -> cpu_info[j]; | |||
| if (CPU_ISSET(j, &cpu_orig_mask[0])) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| j -= h; | |||
| if (j < 0) break; | |||
| } | |||
| common -> cpu_info[j + h] = work; | |||
| if (bit) { | |||
| CPU_SET(j + h, &cpu_orig_mask[0]); | |||
| } else { | |||
| CPU_CLR(j + h, &cpu_orig_mask[0]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -416,7 +470,10 @@ static void numa_mapping(void) { | |||
| fprintf(stderr, "\nSorting ...\n\n"); | |||
| for (cpu = 0; cpu < count; cpu++) | |||
| fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); | |||
| fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], | |||
| READ_CPU(common -> cpu_info[cpu]), | |||
| READ_CORE(common -> cpu_info[cpu]), | |||
| READ_NODE(common -> cpu_info[cpu])); | |||
| #endif | |||
| } | |||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||
| endif | |||
| ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) | |||
| #only build without Fortran | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| @@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| @@ -21,16 +21,16 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
| @@ -0,0 +1,206 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define M r3 | |||
| #define N r4 | |||
| #define A r5 | |||
| #define LDA r6 | |||
| #define B r7 | |||
| #define A0 r8 | |||
| #define A1 r9 | |||
| #define A2 r10 | |||
| #define A3 r11 | |||
| #define J r12 | |||
| #define PREA r14 | |||
| #define PREB r15 | |||
| #define BO r16 | |||
| #define B8 r17 | |||
| #define B4 r18 | |||
| #define B2 r19 | |||
| #define B1 r20 | |||
| #define o4 r21 | |||
| #define T2 r22 | |||
| #define I r23 | |||
| #define o16 r24 | |||
| #define o32 r25 | |||
| #define o48 r26 | |||
| #define NOTUS2 r27 | |||
| #define M8 r30 | |||
| #define T1 r31 | |||
| #define o0 0 | |||
| #include "cgemm_tcopy_macros_8_power8.S" | |||
| #define STACKSIZE 384 | |||
| PROLOGUE /* Entry point: sets up a stack frame, derives strides and destination pointers from M, N, A, LDA, B (r3-r7 per the #defines), then runs the included copy logic. */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* reserve a STACKSIZE-byte frame by moving SP down */ | |||
| li r0, 0 /* r0 = 0; not referenced again in this block (NOTE(review): may be used by the included macros - confirm) */ | |||
| std r31, 144(SP) /* save r31..r14 at frame offsets 144..280; each is reloaded from the same offset after L999 */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| cmpwi cr0, M, 0 /* nothing to do when M <= 0 ... */ | |||
| ble- L999 | |||
| cmpwi cr0, N, 0 /* ... or when N <= 0: go straight to the exit path */ | |||
| ble- L999 | |||
| slwi LDA, LDA, ZBASE_SHIFT /* LDA <<= ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT is defined outside this file) */ | |||
| slwi M8, M, 3 + ZBASE_SHIFT /* M8 = M << (3 + ZBASE_SHIFT), i.e. 8*M scaled by the same shift */ | |||
| li T2, -8 /* T2, PREA, PREB temporarily hold the masks -8, -4, -2 */ | |||
| li PREA, -4 | |||
| li PREB, -2 | |||
| and B4, N, T2 /* B4 = N rounded down to a multiple of 8 */ | |||
| and B2, N, PREA /* B2 = N rounded down to a multiple of 4 */ | |||
| and B1, N, PREB /* B1 = N rounded down to a multiple of 2 */ | |||
| mullw B4, B4, M /* multiply each rounded N by M ... */ | |||
| mullw B2, B2, M | |||
| mullw B1, B1, M | |||
| slwi B4, B4, ZBASE_SHIFT /* ... scale by ZBASE_SHIFT ... */ | |||
| slwi B2, B2, ZBASE_SHIFT | |||
| slwi B1, B1, ZBASE_SHIFT | |||
| add B4, B4, B /* ... and add the base B, so B4/B2/B1 = B + (((N & mask) * M) << ZBASE_SHIFT) */ | |||
| add B2, B2, B | |||
| add B1, B1, B | |||
| li PREA, 384 /* PREA and PREB are reused: the mask values above are no longer needed */ | |||
| addi PREB, M8, 128 /* PREB = M8 + 128 */ | |||
| li o4, 4 /* o4/o16/o32/o48 hold the constants 4, 16, 32, 48 (presumably index offsets for the included copy macros - confirm) */ | |||
| li o16, 16 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| #include "cgemm_tcopy_logic_8_power8.S" /* main copy loops are textually included here */ | |||
| L999: /* common exit path, also reached directly when M <= 0 or N <= 0 */ | |||
| li r3, 0 /* r3 = 0 before returning (presumably the return value - confirm against the ABI) */ | |||
| ld r31, 144(SP) /* reload r31..r14 from the offsets they were stored at on entry */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame reserved on entry */ | |||
| blr /* branch to link register: return to caller */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,247 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| srawi. I, M, 2 /* 4-row section: I = M >> 2 = number of 4-row groups. NOTE(review): srawi. shifts only the low 32 bits of M while the J counters below use the 64-bit sradi. - confirm M always fits in 32 bits */ | |||
| ble CCOPYT_L2_BEGIN /* no complete group of 4 rows: go to the 2-row remainder */ | |||
| CCOPYT_L4_BEGIN: /* top of the loop over groups of 4 rows */ | |||
| mr A0, A /* A0..A3 = four source pointers spaced LDA apart */ | |||
| add A1, A0, LDA | |||
| add A2, A1, LDA | |||
| add A3, A2, LDA | |||
| add A, A3, LDA /* A now points at the next group of 4 rows */ | |||
| mr B8, B /* B8 = destination of this group's 8-wide panels */ | |||
| addi B, B, 64*SIZE /* reserve one 4x8 tile in B; equals the 256 bytes COPY_4x8 writes if SIZE is 4 (SIZE is defined outside this chunk - confirm) */ | |||
| sradi. J, N, 3 /* J = N >> 3 = number of 8-wide column blocks */ | |||
| ble CCOPYT_L4x4_BEGIN /* fewer than 8 columns: only the 4/2/1 tails remain */ | |||
| mr BO, B8 | |||
| CCOPYT_L4x8_LOOP: /* body is unrolled twice: two COPY_4x8 tiles per pass when J allows */ | |||
| dcbt A0, PREA /* prefetch hints for the four source rows at offset PREA */ | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| dcbtst BO, M8 /* store-prefetch hint at BO+M8, where the next tile will be written */ | |||
| dcbtst BO, PREB /* store-prefetch hint at BO+PREB */ | |||
| COPY_4x8 /* copy one 4x8 tile to BO; the macro advances A0..A3 */ | |||
| add BO, BO, M8 /* step to the next tile; M8 is set up outside this chunk, presumably the byte stride between 8-wide panels - confirm */ | |||
| addic. J, J, -1 | |||
| ble CCOPYT_L4x4_BEGIN /* odd block count: leave between the two unrolled copies */ | |||
| COPY_4x8 /* second unrolled tile of this pass */ | |||
| add BO, BO, M8 | |||
| addic. J, J, -1 | |||
| bgt CCOPYT_L4x8_LOOP | |||
| CCOPYT_L4x4_BEGIN: /* column tail of 4, taken when bit 2 of N is set */ | |||
| andi. T1, N, 4 /* result is 0 or 4, so the ble below branches only when the bit is clear */ | |||
| ble CCOPYT_L4x2_BEGIN | |||
| mr BO, B4 /* B4 is a separate output cursor initialised outside this chunk */ | |||
| COPY_4x4 | |||
| addi B4, B4, 32*SIZE /* advance the 4-wide cursor past one 4x4 tile */ | |||
| CCOPYT_L4x2_BEGIN: /* column tail of 2, taken when bit 1 of N is set */ | |||
| andi. T1, N, 2 | |||
| ble CCOPYT_L4x1_BEGIN | |||
| mr BO, B2 /* B2 is a separate output cursor initialised outside this chunk */ | |||
| COPY_4x2 | |||
| addi B2, B2, 16*SIZE /* advance the 2-wide cursor past one 4x2 tile */ | |||
| CCOPYT_L4x1_BEGIN: /* column tail of 1, taken when bit 0 of N is set */ | |||
| andi. T1, N, 1 | |||
| ble CCOPYT_L4_END | |||
| mr BO, B1 /* B1 is a separate output cursor initialised outside this chunk */ | |||
| COPY_4x1 | |||
| addi B1, B1, 8*SIZE /* advance the 1-wide cursor past one 4x1 tile */ | |||
| CCOPYT_L4_END: | |||
| addic. I, I, -1 /* one group of 4 rows finished */ | |||
| bgt CCOPYT_L4_BEGIN /* repeat while groups remain */ | |||
| CCOPYT_L2_BEGIN: /* 2-row remainder: executed once when bit 1 of M is set */ | |||
| andi. T1, M, 2 /* result is 0 or 2, so the ble below branches only when the bit is clear */ | |||
| ble CCOPYT_L1_BEGIN | |||
| mr A0, A /* A0, A1 = two source pointers spaced LDA apart */ | |||
| add A1, A0, LDA | |||
| add A, A1, LDA /* A now points past these 2 rows */ | |||
| mr B8, B /* B8 = destination of this group's 8-wide panels */ | |||
| addi B, B, 32*SIZE /* reserve one 2x8 tile in B; equals the 128 bytes COPY_2x8 writes if SIZE is 4 (SIZE is defined outside this chunk - confirm) */ | |||
| sradi. J, N, 3 /* J = N >> 3 = number of 8-wide column blocks */ | |||
| ble CCOPYT_L2x4_BEGIN | |||
| mr BO, B8 | |||
| CCOPYT_L2x8_LOOP: /* one 2x8 tile per iteration; no prefetch and no unrolling here, unlike the 4-row loop */ | |||
| COPY_2x8 | |||
| add BO, BO, M8 /* step to the next tile; M8 is set up outside this chunk, presumably the byte stride between 8-wide panels - confirm */ | |||
| addic. J, J, -1 | |||
| bgt CCOPYT_L2x8_LOOP | |||
| CCOPYT_L2x4_BEGIN: /* column tail of 4, taken when bit 2 of N is set */ | |||
| andi. T1, N, 4 | |||
| ble CCOPYT_L2x2_BEGIN | |||
| mr BO, B4 /* B4 is a separate output cursor initialised outside this chunk */ | |||
| COPY_2x4 | |||
| addi B4, B4, 16*SIZE /* advance the 4-wide cursor past one 2x4 tile */ | |||
| CCOPYT_L2x2_BEGIN: /* column tail of 2, taken when bit 1 of N is set */ | |||
| andi. T1, N, 2 | |||
| ble CCOPYT_L2x1_BEGIN | |||
| mr BO, B2 /* B2 is a separate output cursor initialised outside this chunk */ | |||
| COPY_2x2 | |||
| addi B2, B2, 8*SIZE /* advance the 2-wide cursor past one 2x2 tile */ | |||
| CCOPYT_L2x1_BEGIN: /* column tail of 1, taken when bit 0 of N is set */ | |||
| andi. T1, N, 1 | |||
| ble CCOPYT_L2_END | |||
| mr BO, B1 /* B1 is a separate output cursor initialised outside this chunk */ | |||
| COPY_2x1 | |||
| addi B1, B1, 4*SIZE /* advance the 1-wide cursor past one 2x1 tile */ | |||
| CCOPYT_L2_END: /* falls through into the 1-row remainder */ | |||
| CCOPYT_L1_BEGIN: /* 1-row remainder: executed once when bit 0 of M is set */ | |||
| andi. T1, M, 1 /* result is 0 or 1, so the ble below branches only when the bit is clear */ | |||
| ble L999 /* nothing left to copy: jump to L999, the exit label of the including routine */ | |||
| mr A0, A /* A0 = the single remaining source row */ | |||
| add A, A0, LDA | |||
| mr B8, B /* B8 = destination of this row's 8-wide panels */ | |||
| addi B, B, 16*SIZE /* reserve one 1x8 tile in B; equals the 64 bytes COPY_1x8 writes if SIZE is 4 (SIZE is defined outside this chunk - confirm) */ | |||
| sradi. J, N, 3 /* J = N >> 3 = number of 8-wide column blocks */ | |||
| ble CCOPYT_L1x4_BEGIN | |||
| mr BO, B8 | |||
| CCOPYT_L1x8_LOOP: /* one 1x8 tile per iteration */ | |||
| COPY_1x8 | |||
| add BO, BO, M8 /* step to the next tile; M8 is set up outside this chunk, presumably the byte stride between 8-wide panels - confirm */ | |||
| addic. J, J, -1 | |||
| bgt CCOPYT_L1x8_LOOP | |||
| CCOPYT_L1x4_BEGIN: /* column tail of 4, taken when bit 2 of N is set */ | |||
| andi. T1, N, 4 | |||
| ble CCOPYT_L1x2_BEGIN | |||
| mr BO, B4 /* B4 is a separate output cursor initialised outside this chunk */ | |||
| COPY_1x4 | |||
| addi B4, B4, 8*SIZE /* advance the 4-wide cursor past one 1x4 tile */ | |||
| CCOPYT_L1x2_BEGIN: /* column tail of 2, taken when bit 1 of N is set */ | |||
| andi. T1, N, 2 | |||
| ble CCOPYT_L1x1_BEGIN | |||
| mr BO, B2 /* B2 is a separate output cursor initialised outside this chunk */ | |||
| COPY_1x2 | |||
| addi B2, B2, 4*SIZE /* advance the 2-wide cursor past one 1x2 tile */ | |||
| CCOPYT_L1x1_BEGIN: /* column tail of 1, taken when bit 0 of N is set */ | |||
| andi. T1, N, 1 | |||
| ble CCOPYT_L1_END | |||
| mr BO, B1 /* B1 is a separate output cursor initialised outside this chunk */ | |||
| COPY_1x1 | |||
| addi B1, B1, 2*SIZE /* advance the 1-wide cursor past one 1x1 tile */ | |||
| CCOPYT_L1_END: /* end of this fragment; execution continues in whatever follows it in the including file */ | |||
| @@ -0,0 +1,385 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x8 /* Copy 64 bytes from each of A0..A3 into 256 contiguous bytes starting at BO. Advances A0..A3 by 64, leaves BO unchanged, clobbers T1 and vs32-vs47. */ | |||
| lxvw4x vs32, o0, A0 /* row A0: four 16-byte vector loads; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| lxvw4x vs34, o32, A0 | |||
| lxvw4x vs35, o48, A0 | |||
| lxvw4x vs36, o0, A1 /* row A1 */ | |||
| lxvw4x vs37, o16, A1 | |||
| lxvw4x vs38, o32, A1 | |||
| lxvw4x vs39, o48, A1 | |||
| addi A0, A0, 64 /* pointer bumps are interleaved with the loads; each comes after the last load from its row */ | |||
| addi A1, A1, 64 | |||
| lxvw4x vs40, o0, A2 /* row A2 */ | |||
| lxvw4x vs41, o16, A2 | |||
| lxvw4x vs42, o32, A2 | |||
| lxvw4x vs43, o48, A2 | |||
| lxvw4x vs44, o0, A3 /* row A3 */ | |||
| lxvw4x vs45, o16, A3 | |||
| lxvw4x vs46, o32, A3 | |||
| lxvw4x vs47, o48, A3 | |||
| mr T1, BO /* T1 = running store pointer, so BO itself is preserved for the caller */ | |||
| addi A2, A2, 64 | |||
| addi A3, A3, 64 | |||
| stxvw4x vs32, o0, T1 /* store row A0 at BO+0 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs36, o0, T1 /* store row A1 at BO+64 */ | |||
| stxvw4x vs37, o16, T1 | |||
| stxvw4x vs38, o32, T1 | |||
| stxvw4x vs39, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs40, o0, T1 /* store row A2 at BO+128 */ | |||
| stxvw4x vs41, o16, T1 | |||
| stxvw4x vs42, o32, T1 | |||
| stxvw4x vs43, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs44, o0, T1 /* store row A3 at BO+192 */ | |||
| stxvw4x vs45, o16, T1 | |||
| stxvw4x vs46, o32, T1 | |||
| stxvw4x vs47, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x4 /* Copy 32 bytes from each of A0..A3 into 128 contiguous bytes starting at BO. Advances A0..A3 by 32, leaves BO unchanged, clobbers T1 and vs32-vs39. */ | |||
| lxvw4x vs32, o0, A0 /* row A0: two 16-byte vector loads; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| lxvw4x vs34, o0, A1 /* row A1 */ | |||
| lxvw4x vs35, o16, A1 | |||
| addi A1, A1, 32 | |||
| lxvw4x vs36, o0, A2 /* row A2 */ | |||
| lxvw4x vs37, o16, A2 | |||
| addi A2, A2, 32 | |||
| lxvw4x vs38, o0, A3 /* row A3 */ | |||
| lxvw4x vs39, o16, A3 | |||
| addi A3, A3, 32 | |||
| mr T1, BO /* T1 = running store pointer, so BO itself is preserved for the caller */ | |||
| stxvw4x vs32, o0, T1 /* rows A0 and A1 go to BO+0 .. BO+63 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs36, o0, T1 /* rows A2 and A3 go to BO+64 .. BO+127 */ | |||
| stxvw4x vs37, o16, T1 | |||
| stxvw4x vs38, o32, T1 | |||
| stxvw4x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x2 /* Copy 16 bytes from each of A0..A3 into 64 contiguous bytes starting at BO. Advances A0..A3 by 16, leaves BO unchanged, clobbers T1 and vs32-vs35. */ | |||
| lxvw4x vs32, o0, A0 /* one 16-byte vector load per row; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| addi A0, A0, 16 | |||
| lxvw4x vs33, o0, A1 | |||
| addi A1, A1, 16 | |||
| lxvw4x vs34, o0, A2 | |||
| addi A2, A2, 16 | |||
| lxvw4x vs35, o0, A3 | |||
| addi A3, A3, 16 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxvw4x vs32, o0, T1 /* rows A0..A3 are stored back to back */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x1 /* Copy two single-precision words (8 bytes) from each of A0..A3 into 32 contiguous bytes starting at BO. Advances A0..A3 by 8, leaves BO unchanged, clobbers T1 and vs32-vs39. */ | |||
| lxsspx vs32, o0, A0 /* scalar single-precision loads, two per row; assumes o0/o4 are index operands worth 0/4 (defined outside this chunk) - TODO confirm. Presumably the real and imaginary parts of one complex value */ | |||
| lxsspx vs33, o4, A0 | |||
| addi A0, A0, 8 | |||
| lxsspx vs34, o0, A1 | |||
| lxsspx vs35, o4, A1 | |||
| addi A1, A1, 8 | |||
| lxsspx vs36, o0, A2 | |||
| lxsspx vs37, o4, A2 | |||
| addi A2, A2, 8 | |||
| lxsspx vs38, o0, A3 | |||
| lxsspx vs39, o4, A3 | |||
| addi A3, A3, 8 | |||
| mr T1, BO /* T1 = running store pointer, so BO itself is preserved for the caller */ | |||
| stxsspx vs32, o0, T1 /* row A0 at BO+0 */ | |||
| stxsspx vs33, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs34, o0, T1 /* row A1 at BO+8 */ | |||
| stxsspx vs35, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs36, o0, T1 /* row A2 at BO+16 */ | |||
| stxsspx vs37, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs38, o0, T1 /* row A3 at BO+24 */ | |||
| stxsspx vs39, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x8 /* Copy 64 bytes from each of A0 and A1 into 128 contiguous bytes starting at BO. Advances A0 and A1 by 64, leaves BO unchanged, clobbers T1 and vs32-vs39. */ | |||
| lxvw4x vs32, o0, A0 /* row A0: four 16-byte vector loads; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| lxvw4x vs34, o32, A0 | |||
| lxvw4x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvw4x vs36, o0, A1 /* row A1 */ | |||
| lxvw4x vs37, o16, A1 | |||
| lxvw4x vs38, o32, A1 | |||
| lxvw4x vs39, o48, A1 | |||
| addi A1, A1, 64 | |||
| mr T1, BO /* T1 = running store pointer, so BO itself is preserved for the caller */ | |||
| stxvw4x vs32, o0, T1 /* row A0 at BO+0 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs36, o0, T1 /* row A1 at BO+64 */ | |||
| stxvw4x vs37, o16, T1 | |||
| stxvw4x vs38, o32, T1 | |||
| stxvw4x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x4 /* Copy 32 bytes from each of A0 and A1 into 64 contiguous bytes starting at BO. Advances A0 and A1 by 32, leaves BO unchanged, clobbers T1 and vs32-vs35. */ | |||
| lxvw4x vs32, o0, A0 /* row A0: two 16-byte vector loads; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| lxvw4x vs34, o0, A1 /* row A1 */ | |||
| lxvw4x vs35, o16, A1 | |||
| addi A1, A1, 32 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxvw4x vs32, o0, T1 /* row A0 at BO+0, row A1 at BO+32 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x2 /* Copy 16 bytes from each of A0 and A1 into 32 contiguous bytes starting at BO. Advances A0 and A1 by 16, leaves BO unchanged, clobbers T1, vs32 and vs33. */ | |||
| lxvw4x vs32, o0, A0 /* one 16-byte vector load per row; assumes o0/o16 are index operands worth 0/16 (defined outside this chunk) - TODO confirm */ | |||
| addi A0, A0, 16 | |||
| lxvw4x vs33, o0, A1 | |||
| addi A1, A1, 16 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxvw4x vs32, o0, T1 /* row A0 at BO+0, row A1 at BO+16 */ | |||
| stxvw4x vs33, o16, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x1 /* Copy two single-precision words (8 bytes) from each of A0 and A1 into 16 contiguous bytes starting at BO. Advances A0 and A1 by 8, leaves BO unchanged, clobbers T1 and vs32-vs35. */ | |||
| lxsspx vs32, o0, A0 /* scalar single-precision loads, two per row; assumes o0/o4 are index operands worth 0/4 (defined outside this chunk) - TODO confirm. Presumably the real and imaginary parts of one complex value */ | |||
| lxsspx vs33, o4, A0 | |||
| addi A0, A0, 8 | |||
| lxsspx vs34, o0, A1 | |||
| lxsspx vs35, o4, A1 | |||
| addi A1, A1, 8 | |||
| mr T1, BO /* T1 = running store pointer, so BO itself is preserved for the caller */ | |||
| stxsspx vs32, o0, T1 /* row A0 at BO+0 */ | |||
| stxsspx vs33, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs34, o0, T1 /* row A1 at BO+8 */ | |||
| stxsspx vs35, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x8 /* Copy 64 bytes from A0 to BO. Advances A0 by 64, leaves BO unchanged, clobbers T1 and vs32-vs35. */ | |||
| lxvw4x vs32, o0, A0 /* four 16-byte vector loads; assumes o0/o16/o32/o48 are index operands worth 0/16/32/48 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| lxvw4x vs34, o32, A0 | |||
| lxvw4x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxvw4x vs32, o0, T1 | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x4 /* Copy 32 bytes from A0 to BO. Advances A0 by 32, leaves BO unchanged, clobbers T1, vs32 and vs33. */ | |||
| lxvw4x vs32, o0, A0 /* two 16-byte vector loads; assumes o0/o16 are index operands worth 0/16 (defined outside this chunk) - TODO confirm */ | |||
| lxvw4x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxvw4x vs32, o0, T1 | |||
| stxvw4x vs33, o16, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x2 /* Copy 16 bytes from A0 to BO. Advances A0 by 16, leaves BO unchanged, clobbers T1 and vs32. */ | |||
| lxvw4x vs32, o0, A0 /* one 16-byte vector load; assumes o0 is an index operand worth 0 (defined outside this chunk) - TODO confirm */ | |||
| addi A0, A0, 16 | |||
| mr T1, BO /* the store goes through T1 rather than BO, matching the larger COPY_* macros */ | |||
| stxvw4x vs32, o0, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x1 /* Copy two single-precision words (8 bytes) from A0 to BO. Advances A0 by 8, leaves BO unchanged, clobbers T1, vs32 and vs33. */ | |||
| lxsspx vs32, o0, A0 /* scalar single-precision loads; assumes o0/o4 are index operands worth 0/4 (defined outside this chunk) - TODO confirm. Presumably the real and imaginary parts of one complex value */ | |||
| lxsspx vs33, o4, A0 | |||
| addi A0, A0, 8 | |||
| mr T1, BO /* stores go through T1; it is never advanced here */ | |||
| stxsspx vs32, o0, T1 | |||
| stxsspx vs33, o4, T1 | |||
| .endm | |||
| @@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define o40 r12 | |||
| #define o56 r11 | |||
| #define o112 r14 | |||
| #define o8 r15 | |||
| #define o24 r16 | |||
| #define ALPHA r17 | |||
| #define o64 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define KK r20 | |||
| #define BB r21 | |||
| #define o80 r20 | |||
| #define o96 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| @@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| @@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| #endif | |||
| stfd f1, ALPHA_SP | |||
| @@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble .L999_H1 | |||
| #ifdef __64BIT__ | |||
| addi ALPHA, SP, 296 | |||
| addi T1, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| addi T1, SP, 224 | |||
| #endif | |||
| li PRE, 256 | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| li o64, 64 | |||
| li o80, 80 | |||
| li o96, 96 | |||
| li o112, 112 | |||
| lxvdsx alpha_r, 0, ALPHA | |||
| lxvdsx alpha_r, 0, T1 | |||
| #include "dgemm_logic_16x4_power8.S" | |||
| @@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| @@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvdsx vs24, 0, BO | |||
| lxvdsx vs25, o8, BO | |||
| addi AO, AO, 64 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| lxvdsx vs26, o16, BO | |||
| lxvdsx vs27, o24, BO | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| .macro KERNEL4x16_I1 | |||
| xvmuldp vs32, vs0, vs24 | |||
| xvmuldp vs33, vs1, vs24 | |||
| xvmuldp vs34, vs2, vs24 | |||
| xvmuldp vs35, vs3, vs24 | |||
| xvmuldp vs32, vs0, vs24 | |||
| xvmuldp vs33, vs1, vs24 | |||
| xvmuldp vs34, vs2, vs24 | |||
| xvmuldp vs35, vs3, vs24 | |||
| lxvd2x vs8, 0, AO | |||
| lxvd2x vs8, o0, AO | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmuldp vs36, vs4, vs24 | |||
| xvmuldp vs37, vs5, vs24 | |||
| xvmuldp vs38, vs6, vs24 | |||
| xvmuldp vs39, vs7, vs24 | |||
| xvmuldp vs36, vs4, vs24 | |||
| xvmuldp vs37, vs5, vs24 | |||
| xvmuldp vs38, vs6, vs24 | |||
| xvmuldp vs39, vs7, vs24 | |||
| lxvdsx vs28, 0, BO | |||
| lxvdsx vs29, o8, BO | |||
| xvmuldp vs40, vs0, vs25 | |||
| xvmuldp vs41, vs1, vs25 | |||
| xvmuldp vs42, vs2, vs25 | |||
| xvmuldp vs43, vs3, vs25 | |||
| xvmuldp vs40, vs0, vs25 | |||
| xvmuldp vs41, vs1, vs25 | |||
| xvmuldp vs42, vs2, vs25 | |||
| xvmuldp vs43, vs3, vs25 | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmuldp vs44, vs4, vs25 | |||
| xvmuldp vs45, vs5, vs25 | |||
| xvmuldp vs46, vs6, vs25 | |||
| xvmuldp vs47, vs7, vs25 | |||
| xvmuldp vs44, vs4, vs25 | |||
| xvmuldp vs45, vs5, vs25 | |||
| xvmuldp vs46, vs6, vs25 | |||
| xvmuldp vs47, vs7, vs25 | |||
| addi AO, AO, 64 | |||
| xvmuldp vs48, vs0, vs26 | |||
| xvmuldp vs49, vs1, vs26 | |||
| xvmuldp vs50, vs2, vs26 | |||
| xvmuldp vs51, vs3, vs26 | |||
| xvmuldp vs48, vs0, vs26 | |||
| xvmuldp vs49, vs1, vs26 | |||
| xvmuldp vs50, vs2, vs26 | |||
| xvmuldp vs51, vs3, vs26 | |||
| lxvd2x vs12, 0, AO | |||
| lxvd2x vs13, o16, AO | |||
| lxvd2x vs12, o64, AO | |||
| lxvd2x vs13, o80, AO | |||
| xvmuldp vs52, vs4, vs26 | |||
| xvmuldp vs53, vs5, vs26 | |||
| xvmuldp vs54, vs6, vs26 | |||
| xvmuldp vs55, vs7, vs26 | |||
| xvmuldp vs52, vs4, vs26 | |||
| xvmuldp vs53, vs5, vs26 | |||
| xvmuldp vs54, vs6, vs26 | |||
| xvmuldp vs55, vs7, vs26 | |||
| lxvd2x vs14, o32, AO | |||
| lxvd2x vs15, o48, AO | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmuldp vs56, vs0, vs27 | |||
| xvmuldp vs57, vs1, vs27 | |||
| xvmuldp vs58, vs2, vs27 | |||
| xvmuldp vs59, vs3, vs27 | |||
| xvmuldp vs56, vs0, vs27 | |||
| xvmuldp vs57, vs1, vs27 | |||
| xvmuldp vs58, vs2, vs27 | |||
| xvmuldp vs59, vs3, vs27 | |||
| lxvdsx vs30, o16, BO | |||
| lxvdsx vs31, o24, BO | |||
| xvmuldp vs60, vs4, vs27 | |||
| xvmuldp vs61, vs5, vs27 | |||
| xvmuldp vs62, vs6, vs27 | |||
| xvmuldp vs63, vs7, vs27 | |||
| xvmuldp vs60, vs4, vs27 | |||
| xvmuldp vs61, vs5, vs27 | |||
| xvmuldp vs62, vs6, vs27 | |||
| xvmuldp vs63, vs7, vs27 | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| addi AO, AO, 128 | |||
| .endm | |||
| .macro KERNEL4x16_1 | |||
| xvmaddadp vs32, vs0, vs24 | |||
| @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs34, vs2, vs24 | |||
| xvmaddadp vs35, vs3, vs24 | |||
| lxvd2x vs8, 0, AO | |||
| lxvd2x vs8, o0, AO | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs36, vs4, vs24 | |||
| xvmaddadp vs37, vs5, vs24 | |||
| @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| xvmaddadp vs47, vs7, vs25 | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs48, vs0, vs26 | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| lxvd2x vs12, 0, AO | |||
| lxvd2x vs13, o16, AO | |||
| lxvd2x vs12, o64, AO | |||
| lxvd2x vs13, o80, AO | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| xvmaddadp vs55, vs7, vs26 | |||
| lxvd2x vs14, o32, AO | |||
| lxvd2x vs15, o48, AO | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmaddadp vs56, vs0, vs27 | |||
| xvmaddadp vs57, vs1, vs27 | |||
| @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs62, vs6, vs27 | |||
| xvmaddadp vs63, vs7, vs27 | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs46, vs14, vs29 | |||
| xvmaddadp vs47, vs15, vs29 | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs48, vs8, vs30 | |||
| xvmaddadp vs49, vs9, vs30 | |||
| xvmaddadp vs50, vs10, vs30 | |||
| xvmaddadp vs51, vs11, vs30 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| xvmaddadp vs52, vs12, vs30 | |||
| xvmaddadp vs53, vs13, vs30 | |||
| xvmaddadp vs54, vs14, vs30 | |||
| xvmaddadp vs55, vs15, vs30 | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| xvmaddadp vs56, vs8, vs31 | |||
| xvmaddadp vs57, vs9, vs31 | |||
| @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs62, vs14, vs31 | |||
| xvmaddadp vs63, vs15, vs31 | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| .macro KERNEL4x16_L1 /* First half of a two-step pipelined update: accumulates vs0-vs7 times vs24-vs27 into vs32-vs63 while loading the next operands into vs8-vs15 (from AO) and vs28-vs31 (from BO). Advances AO by 128; BO is not advanced here. */ | |||
| xvmaddadp vs32, vs0, vs24 /* vs32-vs39 += vs0-vs7 * vs24 */ | |||
| xvmaddadp vs33, vs1, vs24 | |||
| xvmaddadp vs34, vs2, vs24 | |||
| xvmaddadp vs35, vs3, vs24 | |||
| lxvd2x vs8, o0, AO /* next A operands: vs8-vs11 from AO+0..63; offsets assume each oNN index operand holds the value NN */ | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs36, vs4, vs24 | |||
| xvmaddadp vs37, vs5, vs24 | |||
| xvmaddadp vs38, vs6, vs24 | |||
| xvmaddadp vs39, vs7, vs24 | |||
| lxvdsx vs28, 0, BO /* next B operands: one double from BO+0 and BO+8, each splatted across its vector */ | |||
| lxvdsx vs29, o8, BO | |||
| xvmaddadp vs40, vs0, vs25 /* vs40-vs47 += vs0-vs7 * vs25 */ | |||
| xvmaddadp vs41, vs1, vs25 | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| xvmaddadp vs47, vs7, vs25 | |||
| xvmaddadp vs48, vs0, vs26 /* vs48-vs55 += vs0-vs7 * vs26 */ | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| lxvd2x vs12, o64, AO /* next A operands: vs12-vs15 from AO+64..127 */ | |||
| lxvd2x vs13, o80, AO | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| xvmaddadp vs55, vs7, vs26 | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmaddadp vs56, vs0, vs27 /* vs56-vs63 += vs0-vs7 * vs27 */ | |||
| xvmaddadp vs57, vs1, vs27 | |||
| xvmaddadp vs58, vs2, vs27 | |||
| xvmaddadp vs59, vs3, vs27 | |||
| lxvdsx vs30, o16, BO /* next B operands: splat loads from BO+16 and BO+24 */ | |||
| lxvdsx vs31, o24, BO | |||
| xvmaddadp vs60, vs4, vs27 | |||
| xvmaddadp vs61, vs5, vs27 | |||
| xvmaddadp vs62, vs6, vs27 | |||
| xvmaddadp vs63, vs7, vs27 | |||
| addi AO, AO, 128 /* step past the 128 bytes of A just loaded; KERNEL4x16_L2 advances BO for both halves */ | |||
| .endm | |||
| .macro KERNEL4x16_L2 /* Second half of the pipelined pair: accumulates vs8-vs15 times vs28-vs31 into vs32-vs63 while reloading vs0-vs7 (from AO) and vs24-vs27 (from BO+32..63). Advances AO by 128 and BO by 64. */ | |||
| xvmaddadp vs32, vs8, vs28 /* vs32-vs39 += vs8-vs15 * vs28 */ | |||
| xvmaddadp vs33, vs9, vs28 | |||
| xvmaddadp vs34, vs10, vs28 | |||
| xvmaddadp vs35, vs11, vs28 | |||
| lxvd2x vs0, 0, AO /* reload A operands: vs0-vs7 from AO+0..127, spread through the macro; offsets assume each oNN index operand holds the value NN */ | |||
| lxvd2x vs1, o16, AO | |||
| xvmaddadp vs36, vs12, vs28 | |||
| xvmaddadp vs37, vs13, vs28 | |||
| xvmaddadp vs38, vs14, vs28 | |||
| xvmaddadp vs39, vs15, vs28 | |||
| lxvdsx vs24, o32, BO /* reload B operands: splat loads from BO+32 and BO+40. NOTE(review): o40/o56 share r12/r11 with T4/T3 per the register defines; the code that loads 40/56 into them is outside this view - confirm they are set before this macro runs */ | |||
| lxvdsx vs25, o40, BO | |||
| xvmaddadp vs40, vs8, vs29 /* vs40-vs47 += vs8-vs15 * vs29 */ | |||
| xvmaddadp vs41, vs9, vs29 | |||
| xvmaddadp vs42, vs10, vs29 | |||
| xvmaddadp vs43, vs11, vs29 | |||
| lxvd2x vs2, o32, AO | |||
| lxvd2x vs3, o48, AO | |||
| xvmaddadp vs44, vs12, vs29 | |||
| xvmaddadp vs45, vs13, vs29 | |||
| xvmaddadp vs46, vs14, vs29 | |||
| xvmaddadp vs47, vs15, vs29 | |||
| xvmaddadp vs48, vs8, vs30 /* vs48-vs55 += vs8-vs15 * vs30 */ | |||
| xvmaddadp vs49, vs9, vs30 | |||
| xvmaddadp vs50, vs10, vs30 | |||
| xvmaddadp vs51, vs11, vs30 | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| xvmaddadp vs52, vs12, vs30 | |||
| xvmaddadp vs53, vs13, vs30 | |||
| xvmaddadp vs54, vs14, vs30 | |||
| xvmaddadp vs55, vs15, vs30 | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| xvmaddadp vs56, vs8, vs31 /* vs56-vs63 += vs8-vs15 * vs31 */ | |||
| xvmaddadp vs57, vs9, vs31 | |||
| xvmaddadp vs58, vs10, vs31 | |||
| xvmaddadp vs59, vs11, vs31 | |||
| lxvdsx vs26, o48, BO /* splat loads from BO+48 and BO+56 */ | |||
| lxvdsx vs27, o56, BO | |||
| xvmaddadp vs60, vs12, vs31 | |||
| addi AO, AO, 128 /* every load from AO above precedes this bump */ | |||
| xvmaddadp vs61, vs13, vs31 | |||
| xvmaddadp vs62, vs14, vs31 | |||
| addi BO, BO, 64 /* covers the 32 bytes read by KERNEL4x16_L1 plus the 32 bytes read here */ | |||
| xvmaddadp vs63, vs15, vs31 | |||
| .endm | |||
| .macro KERNEL4x16_E2 | |||
| @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvdsx vs26, o16, BO | |||
| lxvdsx vs27, o24, BO | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs32, vs0, vs24 | |||
| @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs41, vs1, vs25 | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| addi AO, AO, 128 | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| @@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x16 | |||
| mr T1, CO | |||
| addi T2, T1, 64 | |||
| add T2, T1, LDC | |||
| add T3, T2, LDC | |||
| add T4, T3, LDC | |||
| lxvd2x vs0, 0, CO | |||
| lxvd2x vs1, o16, CO | |||
| lxvd2x vs2, o32, CO | |||
| lxvd2x vs3, o48, CO | |||
| lxvd2x vs4, o64, CO | |||
| lxvd2x vs5, o80, CO | |||
| lxvd2x vs6, o96, CO | |||
| lxvd2x vs7, o112, CO | |||
| lxvd2x vs8, 0, T2 | |||
| lxvd2x vs9, o16, T2 | |||
| lxvd2x vs10, o32, T2 | |||
| lxvd2x vs11, o48, T2 | |||
| lxvd2x vs12, o64, T2 | |||
| lxvd2x vs13, o80, T2 | |||
| lxvd2x vs14, o96, T2 | |||
| lxvd2x vs15, o112, T2 | |||
| lxvd2x vs24, 0, T3 | |||
| lxvd2x vs25, o16, T3 | |||
| lxvd2x vs26, o32, T3 | |||
| lxvd2x vs27, o48, T3 | |||
| lxvd2x vs28, o64, T3 | |||
| lxvd2x vs29, o80, T3 | |||
| lxvd2x vs30, o96, T3 | |||
| lxvd2x vs31, o112, T3 | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs0, 0, T1 | |||
| lxvd2x vs1, o16, T1 | |||
| lxvd2x vs2, o32, T1 | |||
| lxvd2x vs3, o48, T1 | |||
| lxvd2x vs4, 0, T2 | |||
| lxvd2x vs5, o16, T2 | |||
| lxvd2x vs6, o32, T2 | |||
| lxvd2x vs7, o48, T2 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs0, vs32, alpha_r | |||
| xvmaddadp vs1, vs33, alpha_r | |||
| xvmaddadp vs2, vs34, alpha_r | |||
| @@ -453,171 +599,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs5, vs37, alpha_r | |||
| xvmaddadp vs6, vs38, alpha_r | |||
| xvmaddadp vs7, vs39, alpha_r | |||
| #else | |||
| xvmuldp vs0, vs32, alpha_r | |||
| xvmuldp vs1, vs33, alpha_r | |||
| xvmuldp vs2, vs34, alpha_r | |||
| xvmuldp vs3, vs35, alpha_r | |||
| xvmuldp vs4, vs36, alpha_r | |||
| xvmuldp vs5, vs37, alpha_r | |||
| xvmuldp vs6, vs38, alpha_r | |||
| xvmuldp vs7, vs39, alpha_r | |||
| #endif | |||
| stxvd2x vs0, 0, T1 | |||
| stxvd2x vs1, o16, T1 | |||
| stxvd2x vs2, o32, T1 | |||
| stxvd2x vs3, o48, T1 | |||
| dcbt T1, PRE | |||
| stxvd2x vs4, 0, T2 | |||
| stxvd2x vs5, o16, T2 | |||
| stxvd2x vs6, o32, T2 | |||
| stxvd2x vs7, o48, T2 | |||
| add T1, T1, LDC | |||
| add T2, T2, LDC | |||
| lxvd2x vs32, 0, T4 | |||
| lxvd2x vs33, o16, T4 | |||
| lxvd2x vs34, o32, T4 | |||
| lxvd2x vs35, o48, T4 | |||
| lxvd2x vs36, o64, T4 | |||
| lxvd2x vs37, o80, T4 | |||
| lxvd2x vs38, o96, T4 | |||
| lxvd2x vs39, o112, T4 | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs8, 0, T1 | |||
| lxvd2x vs9, o16, T1 | |||
| lxvd2x vs10, o32, T1 | |||
| lxvd2x vs11, o48, T1 | |||
| lxvd2x vs12, 0, T2 | |||
| lxvd2x vs13, o16, T2 | |||
| lxvd2x vs14, o32, T2 | |||
| lxvd2x vs15, o48, T2 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs8, vs40, alpha_r | |||
| xvmaddadp vs9, vs41, alpha_r | |||
| xvmaddadp vs10, vs42, alpha_r | |||
| xvmaddadp vs11, vs43, alpha_r | |||
| xvmaddadp vs12, vs44, alpha_r | |||
| xvmaddadp vs13, vs45, alpha_r | |||
| xvmaddadp vs14, vs46, alpha_r | |||
| xvmaddadp vs15, vs47, alpha_r | |||
| #else | |||
| xvmuldp vs8, vs40, alpha_r | |||
| xvmuldp vs9, vs41, alpha_r | |||
| xvmuldp vs10, vs42, alpha_r | |||
| xvmuldp vs11, vs43, alpha_r | |||
| xvmuldp vs12, vs44, alpha_r | |||
| xvmuldp vs13, vs45, alpha_r | |||
| xvmuldp vs14, vs46, alpha_r | |||
| xvmuldp vs15, vs47, alpha_r | |||
| #endif | |||
| stxvd2x vs8, 0, T1 | |||
| stxvd2x vs9, o16, T1 | |||
| stxvd2x vs10, o32, T1 | |||
| stxvd2x vs11, o48, T1 | |||
| dcbt T1, PRE | |||
| stxvd2x vs12, 0, T2 | |||
| stxvd2x vs13, o16, T2 | |||
| stxvd2x vs14, o32, T2 | |||
| stxvd2x vs15, o48, T2 | |||
| add T1, T1, LDC | |||
| add T2, T2, LDC | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs0, 0, T1 | |||
| lxvd2x vs1, o16, T1 | |||
| lxvd2x vs2, o32, T1 | |||
| lxvd2x vs3, o48, T1 | |||
| lxvd2x vs4, 0, T2 | |||
| lxvd2x vs5, o16, T2 | |||
| lxvd2x vs6, o32, T2 | |||
| lxvd2x vs7, o48, T2 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs0, vs48, alpha_r | |||
| xvmaddadp vs1, vs49, alpha_r | |||
| xvmaddadp vs2, vs50, alpha_r | |||
| xvmaddadp vs3, vs51, alpha_r | |||
| xvmaddadp vs4, vs52, alpha_r | |||
| xvmaddadp vs5, vs53, alpha_r | |||
| xvmaddadp vs6, vs54, alpha_r | |||
| xvmaddadp vs7, vs55, alpha_r | |||
| #else | |||
| xvmuldp vs0, vs48, alpha_r | |||
| xvmuldp vs1, vs49, alpha_r | |||
| xvmuldp vs2, vs50, alpha_r | |||
| xvmuldp vs3, vs51, alpha_r | |||
| xvmuldp vs4, vs52, alpha_r | |||
| xvmuldp vs5, vs53, alpha_r | |||
| xvmuldp vs6, vs54, alpha_r | |||
| xvmuldp vs7, vs55, alpha_r | |||
| #endif | |||
| stxvd2x vs0, 0, T1 | |||
| stxvd2x vs1, o16, T1 | |||
| stxvd2x vs2, o32, T1 | |||
| stxvd2x vs3, o48, T1 | |||
| dcbt T1, PRE | |||
| stxvd2x vs4, 0, T2 | |||
| stxvd2x vs5, o16, T2 | |||
| stxvd2x vs6, o32, T2 | |||
| stxvd2x vs7, o48, T2 | |||
| add T1, T1, LDC | |||
| add T2, T2, LDC | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs8, 0, T1 | |||
| lxvd2x vs9, o16, T1 | |||
| lxvd2x vs10, o32, T1 | |||
| lxvd2x vs11, o48, T1 | |||
| lxvd2x vs12, 0, T2 | |||
| lxvd2x vs13, o16, T2 | |||
| lxvd2x vs14, o32, T2 | |||
| lxvd2x vs15, o48, T2 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs8, vs56, alpha_r | |||
| xvmaddadp vs9, vs57, alpha_r | |||
| xvmaddadp vs10, vs58, alpha_r | |||
| xvmaddadp vs11, vs59, alpha_r | |||
| xvmaddadp vs12, vs60, alpha_r | |||
| xvmaddadp vs13, vs61, alpha_r | |||
| xvmaddadp vs14, vs62, alpha_r | |||
| xvmaddadp vs15, vs63, alpha_r | |||
| #else | |||
| xvmuldp vs8, vs56, alpha_r | |||
| xvmuldp vs9, vs57, alpha_r | |||
| xvmuldp vs10, vs58, alpha_r | |||
| xvmuldp vs11, vs59, alpha_r | |||
| xvmuldp vs12, vs60, alpha_r | |||
| xvmuldp vs13, vs61, alpha_r | |||
| xvmuldp vs14, vs62, alpha_r | |||
| xvmuldp vs15, vs63, alpha_r | |||
| #endif | |||
| xvmaddadp vs12, vs44, alpha_r | |||
| xvmaddadp vs13, vs45, alpha_r | |||
| xvmaddadp vs14, vs46, alpha_r | |||
| xvmaddadp vs15, vs47, alpha_r | |||
| stxvd2x vs8, 0, T1 | |||
| stxvd2x vs9, o16, T1 | |||
| stxvd2x vs10, o32, T1 | |||
| stxvd2x vs11, o48, T1 | |||
| stxvd2x vs4, o64, T1 | |||
| stxvd2x vs5, o80, T1 | |||
| stxvd2x vs6, o96, T1 | |||
| stxvd2x vs7, o112, T1 | |||
| xvmaddadp vs24, vs48, alpha_r | |||
| xvmaddadp vs25, vs49, alpha_r | |||
| xvmaddadp vs26, vs50, alpha_r | |||
| xvmaddadp vs27, vs51, alpha_r | |||
| stxvd2x vs8, o0, T2 | |||
| stxvd2x vs9, o16, T2 | |||
| stxvd2x vs10, o32, T2 | |||
| stxvd2x vs11, o48, T2 | |||
| xvmaddadp vs28, vs52, alpha_r | |||
| xvmaddadp vs29, vs53, alpha_r | |||
| xvmaddadp vs30, vs54, alpha_r | |||
| xvmaddadp vs31, vs55, alpha_r | |||
| stxvd2x vs12, o64, T2 | |||
| stxvd2x vs13, o80, T2 | |||
| stxvd2x vs14, o96, T2 | |||
| stxvd2x vs15, o112, T2 | |||
| xvmaddadp vs32, vs56, alpha_r | |||
| xvmaddadp vs33, vs57, alpha_r | |||
| xvmaddadp vs34, vs58, alpha_r | |||
| xvmaddadp vs35, vs59, alpha_r | |||
| stxvd2x vs24, 0, T3 | |||
| stxvd2x vs25, o16, T3 | |||
| stxvd2x vs26, o32, T3 | |||
| stxvd2x vs27, o48, T3 | |||
| xvmaddadp vs36, vs60, alpha_r | |||
| xvmaddadp vs37, vs61, alpha_r | |||
| xvmaddadp vs38, vs62, alpha_r | |||
| xvmaddadp vs39, vs63, alpha_r | |||
| stxvd2x vs28, o64, T3 | |||
| stxvd2x vs29, o80, T3 | |||
| stxvd2x vs30, o96, T3 | |||
| stxvd2x vs31, o112, T3 | |||
| stxvd2x vs32, o0, T4 | |||
| stxvd2x vs33, o16, T4 | |||
| stxvd2x vs34, o32, T4 | |||
| stxvd2x vs35, o48, T4 | |||
| dcbt T1, PRE | |||
| addi CO, CO, 128 | |||
| stxvd2x vs12, 0, T2 | |||
| stxvd2x vs13, o16, T2 | |||
| stxvd2x vs14, o32, T2 | |||
| stxvd2x vs15, o48, T2 | |||
| stxvd2x vs36, o64, T4 | |||
| stxvd2x vs37, o80, T4 | |||
| stxvd2x vs38, o96, T4 | |||
| stxvd2x vs39, o112, T4 | |||
| addi CO, CO, 128 | |||
| .endm | |||
| @@ -0,0 +1,228 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /* ASSEMBLER is defined before common.h is included; def_vsx.h is included next (the vsNN operand names used by the macros are expected to come from it). */ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| /* Registers that are read before this file ever writes them, i.e. the values supplied by the caller. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define A r5 | |||
| #define LDA r6 | |||
| #define B r7 | |||
| /* Source cursors: the copy logic sets them to A, A+LDA, A+2*LDA and A+3*LDA. */ | |||
| #define A0 r8 | |||
| #define A1 r9 | |||
| #define A2 r10 | |||
| #define A3 r11 | |||
| /* Output cursor, initialised from B by the copy logic. */ | |||
| #define BO r16 | |||
| /* Loop counters: I is derived from N, J from M. */ | |||
| #define I r23 | |||
| #define J r12 | |||
| /* Prefetch distances; both are loaded with 384 by the entry code. */ | |||
| #define PREA r14 | |||
| #define PREB r15 | |||
| /* Index registers holding the constant byte offsets 8..112 for the indexed VSX loads and stores. */ | |||
| #define o8 r21 | |||
| #define o16 r24 | |||
| #define o32 r25 | |||
| #define o48 r26 | |||
| #define o64 r17 | |||
| #define o80 r18 | |||
| #define o96 r19 | |||
| #define o112 r20 | |||
| /* o0 is the literal 0, so an "o0, Rx" operand pair addresses Rx with no index added. */ | |||
| #define o0 0 | |||
| /* Scratch and spare names; only T1 is referenced by the visible copy logic. */ | |||
| #define T1 r31 | |||
| #define T2 r22 | |||
| #define NOTU1 r27 | |||
| #define NOTU2 r30 | |||
| /* The COPY_* macros use the aliases above, so they are pulled in last. */ | |||
| #include "dgemm_ncopy_macros_4_power8.S" | |||
| /* | |||
| * Frame layout, as byte offsets from SP once the frame has been allocated: | |||
| * 0..143 f14-f31 | |||
| * 144..287 r31-r14 | |||
| * 288..479 vs52-vs63 (the VSX names of v20-v31) | |||
| */ | |||
| #define STACKSIZE 480 | |||
| /* | |||
| * Entry point of the N-copy packing routine. | |||
| * In: M (r3), N (r4), A (r5), LDA (r6, in elements; scaled to bytes below), B (r7). | |||
| * Out: r3 = 0. | |||
| * The routine returns immediately when M <= 0 or N <= 0; otherwise the included | |||
| * dgemm_ncopy_logic_4_power8.S walks A and writes the packed data through B. | |||
| * Every callee-saved register the copy code can touch is preserved here: | |||
| * f14-f31 (they alias vs14-vs31), r14-r31, and vs52-vs63 (v20-v31). | |||
| */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* | |||
| * COPY_4x16 writes vs52-vs63, i.e. v20-v31, which the ELFv2 ABI makes callee-saved. | |||
| * Save them before anything can clobber them. r0 carries the frame offset and sits in | |||
| * the RB slot, where it is read as a real register (only RA = 0 means "no base"). | |||
| */ | |||
| li r0, 288 | |||
| stxvd2x vs52, SP, r0 | |||
| li r0, 304 | |||
| stxvd2x vs53, SP, r0 | |||
| li r0, 320 | |||
| stxvd2x vs54, SP, r0 | |||
| li r0, 336 | |||
| stxvd2x vs55, SP, r0 | |||
| li r0, 352 | |||
| stxvd2x vs56, SP, r0 | |||
| li r0, 368 | |||
| stxvd2x vs57, SP, r0 | |||
| li r0, 384 | |||
| stxvd2x vs58, SP, r0 | |||
| li r0, 400 | |||
| stxvd2x vs59, SP, r0 | |||
| li r0, 416 | |||
| stxvd2x vs60, SP, r0 | |||
| li r0, 432 | |||
| stxvd2x vs61, SP, r0 | |||
| li r0, 448 | |||
| stxvd2x vs62, SP, r0 | |||
| li r0, 464 | |||
| stxvd2x vs63, SP, r0 | |||
| /* Leave r0 = 0, as the original entry sequence did. */ | |||
| li r0, 0 | |||
| /* Nothing to copy for an empty dimension. NOTE(review): cmpwi only examines the low 32 bits of M and N. */ | |||
| cmpwi cr0, M, 0 | |||
| ble- L999 | |||
| cmpwi cr0, N, 0 | |||
| ble- L999 | |||
| /* Turn LDA from an element count into a byte stride. sldi keeps all 64 bits; slwi would zero the upper half of the result. */ | |||
| sldi LDA, LDA, BASE_SHIFT | |||
| li PREA, 384 | |||
| li PREB, 384 | |||
| /* Constant byte offsets used as index registers by the COPY_* macros. */ | |||
| li o8, 8 | |||
| li o16, 16 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| li o64, 64 | |||
| li o80, 80 | |||
| li o96, 96 | |||
| li o112, 112 | |||
| #include "dgemm_ncopy_logic_4_power8.S" | |||
| L999: | |||
| li r3, 0 | |||
| /* Restore v20-v31 (vs52-vs63) from the slots filled in the prologue. */ | |||
| li r0, 288 | |||
| lxvd2x vs52, SP, r0 | |||
| li r0, 304 | |||
| lxvd2x vs53, SP, r0 | |||
| li r0, 320 | |||
| lxvd2x vs54, SP, r0 | |||
| li r0, 336 | |||
| lxvd2x vs55, SP, r0 | |||
| li r0, 352 | |||
| lxvd2x vs56, SP, r0 | |||
| li r0, 368 | |||
| lxvd2x vs57, SP, r0 | |||
| li r0, 384 | |||
| lxvd2x vs58, SP, r0 | |||
| li r0, 400 | |||
| lxvd2x vs59, SP, r0 | |||
| li r0, 416 | |||
| lxvd2x vs60, SP, r0 | |||
| li r0, 432 | |||
| lxvd2x vs61, SP, r0 | |||
| li r0, 448 | |||
| lxvd2x vs62, SP, r0 | |||
| li r0, 464 | |||
| lxvd2x vs63, SP, r0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| @@ -0,0 +1,237 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| mr BO, B /* BO is the running output pointer; it starts at B */ | |||
| srawi. I, N, 2 /* I = N >> 2 (32-bit arithmetic shift, also sets CR0): number of passes of the four-wide loop */ | |||
| ble DCOPYN_L2_BEGIN /* I <= 0: skip the four-wide loop and go to the N & 2 remainder */ | |||
| DCOPYN_L4_BEGIN: | |||
| DCOPYN_L4_LOOP: | |||
| mr A0, A /* A0..A3 = A, A+LDA, A+2*LDA, A+3*LDA */ | |||
| add A1, A0, LDA | |||
| add A2, A1, LDA | |||
| add A3, A2, LDA | |||
| add A, A3, LDA /* A advances by 4*LDA, ready for the next pass of this loop */ | |||
| DCOPYN_L4x16_BEGIN: | |||
| srawi. J, M, 4 /* J = M >> 4: how many times COPY_4x16 runs */ | |||
| ble DCOPYN_L4x16_END | |||
| DCOPYN_L4x16_LOOP: | |||
| dcbt A0, PREA /* cache-touch hint for the address PREA bytes past each source pointer */ | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| COPY_4x16 | |||
| addic. J, J, -1 | |||
| bgt DCOPYN_L4x16_LOOP | |||
| DCOPYN_L4x16_END: | |||
| DCOPYN_L4x8_BEGIN: | |||
| andi. J, M, 8 /* remainder of M: one macro per bit 8, 4, 2, 1 */ | |||
| ble DCOPYN_L4x8_END /* andi. yields 0 or the tested bit, so this branch is taken exactly when the bit is clear */ | |||
| COPY_4x8 | |||
| DCOPYN_L4x8_END: | |||
| DCOPYN_L4x4_BEGIN: | |||
| andi. J, M, 4 | |||
| ble DCOPYN_L4x4_END | |||
| COPY_4x4 | |||
| DCOPYN_L4x4_END: | |||
| DCOPYN_L4x2_BEGIN: | |||
| andi. J, M, 2 | |||
| ble DCOPYN_L4x2_END | |||
| COPY_4x2 | |||
| DCOPYN_L4x2_END: | |||
| DCOPYN_L4x1_BEGIN: | |||
| andi. J, M, 1 | |||
| ble DCOPYN_L4x1_END | |||
| COPY_4x1 | |||
| DCOPYN_L4x1_END: | |||
| DCOPYN_L4_END: | |||
| addic. I, I, -1 /* one four-wide pass done */ | |||
| bgt DCOPYN_L4_LOOP | |||
| DCOPYN_L2_BEGIN: | |||
| /* | |||
| * N & 2 remainder: runs once when bit 1 of N is set and copies from two sources, A and A+LDA. | |||
| * The operand is written as N (the alias already used by the srawi. at the top of this file) | |||
| * instead of a bare register number, so the test follows the alias if it is ever remapped. | |||
| * T1 only receives the throw-away AND result; the branch uses CR0. | |||
| */ | |||
| andi. T1, N, 2 | |||
| ble DCOPYN_L2_END | |||
| DCOPYN_L2_LOOP: | |||
| /* A0 = A, A1 = A + LDA; A moves on by 2*LDA for the N & 1 remainder that follows. */ | |||
| mr A0, A | |||
| add A1, A0, LDA | |||
| add A, A1, LDA | |||
| DCOPYN_L2x16_BEGIN: | |||
| /* J = M >> 4 passes of COPY_2x16. */ | |||
| srawi. J, M, 4 | |||
| ble DCOPYN_L2x16_END | |||
| DCOPYN_L2x16_LOOP: | |||
| COPY_2x16 | |||
| addic. J, J, -1 | |||
| bgt DCOPYN_L2x16_LOOP | |||
| DCOPYN_L2x16_END: | |||
| DCOPYN_L2x8_BEGIN: | |||
| /* Remainder of M: one macro per set bit (8, 4, 2, 1). andi. yields 0 or the bit, so ble skips when it is clear. */ | |||
| andi. J, M, 8 | |||
| ble DCOPYN_L2x8_END | |||
| COPY_2x8 | |||
| DCOPYN_L2x8_END: | |||
| DCOPYN_L2x4_BEGIN: | |||
| andi. J, M, 4 | |||
| ble DCOPYN_L2x4_END | |||
| COPY_2x4 | |||
| DCOPYN_L2x4_END: | |||
| DCOPYN_L2x2_BEGIN: | |||
| andi. J, M, 2 | |||
| ble DCOPYN_L2x2_END | |||
| COPY_2x2 | |||
| DCOPYN_L2x2_END: | |||
| DCOPYN_L2x1_BEGIN: | |||
| andi. J, M, 1 | |||
| ble DCOPYN_L2x1_END | |||
| COPY_2x1 | |||
| DCOPYN_L2x1_END: | |||
| DCOPYN_L2_END: | |||
| DCOPYN_L1_BEGIN: | |||
| /* | |||
| * N & 1 remainder: runs once when bit 0 of N is set and copies from the single source A. | |||
| * The operand is written as N (the alias already used by the srawi. at the top of this file) | |||
| * instead of a bare register number, so the test follows the alias if it is ever remapped. | |||
| * T1 only receives the throw-away AND result; the branch uses CR0. | |||
| */ | |||
| andi. T1, N, 1 | |||
| ble DCOPYN_L1_END | |||
| DCOPYN_L1_LOOP: | |||
| /* A0 = A; A itself still moves on by LDA. */ | |||
| mr A0, A | |||
| add A, A0, LDA | |||
| DCOPYN_L1x16_BEGIN: | |||
| /* J = M >> 4 passes of COPY_1x16. */ | |||
| srawi. J, M, 4 | |||
| ble DCOPYN_L1x16_END | |||
| DCOPYN_L1x16_LOOP: | |||
| COPY_1x16 | |||
| addic. J, J, -1 | |||
| bgt DCOPYN_L1x16_LOOP | |||
| DCOPYN_L1x16_END: | |||
| DCOPYN_L1x8_BEGIN: | |||
| /* Remainder of M: one macro per set bit (8, 4, 2, 1). andi. yields 0 or the bit, so ble skips when it is clear. */ | |||
| andi. J, M, 8 | |||
| ble DCOPYN_L1x8_END | |||
| COPY_1x8 | |||
| DCOPYN_L1x8_END: | |||
| DCOPYN_L1x4_BEGIN: | |||
| andi. J, M, 4 | |||
| ble DCOPYN_L1x4_END | |||
| COPY_1x4 | |||
| DCOPYN_L1x4_END: | |||
| DCOPYN_L1x2_BEGIN: | |||
| andi. J, M, 2 | |||
| ble DCOPYN_L1x2_END | |||
| COPY_1x2 | |||
| DCOPYN_L1x2_END: | |||
| DCOPYN_L1x1_BEGIN: | |||
| andi. J, M, 1 | |||
| ble DCOPYN_L1x1_END | |||
| COPY_1x1 | |||
| DCOPYN_L1x1_END: | |||
| DCOPYN_L1_END: | |||
| @@ -0,0 +1,691 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=16 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x16 /* read 16 doubles from each of A0..A3 and write them to BO interleaved as A0[i], A1[i], A2[i], A3[i] */ | |||
| lxvd2x vs0, o0, A0 /* loads: A0 -> vs0-vs7, A1 -> vs8-vs15, A2 -> vs16-vs23, A3 -> vs24-vs31 (two doubles per register) */ | |||
| lxvd2x vs8, o0, A1 | |||
| lxvd2x vs24, o0, A3 | |||
| lxvd2x vs16, o0, A2 | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs9, o16, A1 | |||
| lxvd2x vs17, o16, A2 | |||
| lxvd2x vs25, o16, A3 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs10, o32, A1 | |||
| lxvd2x vs18, o32, A2 | |||
| lxvd2x vs26, o32, A3 | |||
| lxvd2x vs3, o48, A0 | |||
| lxvd2x vs11, o48, A1 | |||
| lxvd2x vs19, o48, A2 | |||
| lxvd2x vs27, o48, A3 | |||
| lxvd2x vs4, o64, A0 | |||
| lxvd2x vs12, o64, A1 | |||
| lxvd2x vs20, o64, A2 | |||
| lxvd2x vs28, o64, A3 | |||
| lxvd2x vs5, o80, A0 | |||
| lxvd2x vs13, o80, A1 | |||
| lxvd2x vs21, o80, A2 | |||
| lxvd2x vs29, o80, A3 | |||
| lxvd2x vs6, o96, A0 | |||
| lxvd2x vs14, o96, A1 | |||
| lxvd2x vs22, o96, A2 | |||
| lxvd2x vs30, o96, A3 | |||
| lxvd2x vs7, o112, A0 | |||
| lxvd2x vs15, o112, A1 | |||
| lxvd2x vs23, o112, A2 | |||
| lxvd2x vs31, o112, A3 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs16, vs24, 0 | |||
| xxpermdi vs34, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs35, vs16, vs24, 3 | |||
| xxpermdi vs36, vs1, vs9, 0 | |||
| xxpermdi vs37, vs17, vs25, 0 | |||
| xxpermdi vs38, vs1, vs9, 3 | |||
| xxpermdi vs39, vs17, vs25, 3 | |||
| xxpermdi vs40, vs2, vs10, 0 | |||
| xxpermdi vs41, vs18, vs26, 0 | |||
| xxpermdi vs42, vs2, vs10, 3 | |||
| xxpermdi vs43, vs18, vs26, 3 | |||
| xxpermdi vs44, vs3, vs11, 0 | |||
| xxpermdi vs45, vs19, vs27, 0 | |||
| xxpermdi vs46, vs3, vs11, 3 | |||
| xxpermdi vs47, vs19, vs27, 3 | |||
| xxpermdi vs48, vs4, vs12, 0 | |||
| xxpermdi vs49, vs20, vs28, 0 | |||
| xxpermdi vs50, vs4, vs12, 3 | |||
| xxpermdi vs51, vs20, vs28, 3 | |||
| xxpermdi vs52, vs5, vs13, 0 /* from here on the results land in vs52-vs63 */ | |||
| xxpermdi vs53, vs21, vs29, 0 | |||
| xxpermdi vs54, vs5, vs13, 3 | |||
| xxpermdi vs55, vs21, vs29, 3 | |||
| addi A0, A0, 128 /* each source pointer moves past the 128 bytes just read */ | |||
| addi A1, A1, 128 | |||
| xxpermdi vs56, vs6, vs14, 0 | |||
| xxpermdi vs57, vs22, vs30, 0 | |||
| xxpermdi vs58, vs6, vs14, 3 | |||
| xxpermdi vs59, vs22, vs30, 3 | |||
| addi A3, A3, 128 | |||
| addi A2, A2, 128 | |||
| xxpermdi vs60, vs7, vs15, 0 | |||
| xxpermdi vs61, vs23, vs31, 0 | |||
| xxpermdi vs62, vs7, vs15, 3 | |||
| xxpermdi vs63, vs23, vs31, 3 | |||
| stxvd2x vs32, o0, BO /* stores: four runs of 128 bytes, BO bumped after each run (512 bytes in total) */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| stxvd2x vs36, o64, BO | |||
| stxvd2x vs37, o80, BO | |||
| stxvd2x vs38, o96, BO | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| stxvd2x vs40, o0, BO | |||
| stxvd2x vs41, o16, BO | |||
| stxvd2x vs42, o32, BO | |||
| stxvd2x vs43, o48, BO | |||
| stxvd2x vs44, o64, BO | |||
| stxvd2x vs45, o80, BO | |||
| stxvd2x vs46, o96, BO | |||
| stxvd2x vs47, o112, BO | |||
| addi BO, BO, 128 | |||
| stxvd2x vs48, o0, BO | |||
| stxvd2x vs49, o16, BO | |||
| stxvd2x vs50, o32, BO | |||
| stxvd2x vs51, o48, BO | |||
| stxvd2x vs52, o64, BO | |||
| stxvd2x vs53, o80, BO | |||
| stxvd2x vs54, o96, BO | |||
| stxvd2x vs55, o112, BO | |||
| addi BO, BO, 128 | |||
| stxvd2x vs56, o0, BO | |||
| stxvd2x vs57, o16, BO | |||
| stxvd2x vs58, o32, BO | |||
| stxvd2x vs59, o48, BO | |||
| stxvd2x vs60, o64, BO | |||
| stxvd2x vs61, o80, BO | |||
| stxvd2x vs62, o96, BO | |||
| stxvd2x vs63, o112, BO | |||
| addi BO, BO, 128 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x8 /* read 8 doubles from each of A0..A3 and write them to BO interleaved as A0[i], A1[i], A2[i], A3[i] */ | |||
| lxvd2x vs0, o0, A0 /* A0 -> vs0-vs3 */ | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs3, o48, A0 | |||
| addi A0, A0, 64 /* each source pointer moves past the 64 bytes just read */ | |||
| lxvd2x vs8, o0, A1 /* A1 -> vs8-vs11 */ | |||
| lxvd2x vs9, o16, A1 | |||
| lxvd2x vs10, o32, A1 | |||
| lxvd2x vs11, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs16, o0, A2 /* A2 -> vs16-vs19 */ | |||
| lxvd2x vs17, o16, A2 | |||
| lxvd2x vs18, o32, A2 | |||
| lxvd2x vs19, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs24, o0, A3 /* A3 -> vs24-vs27 */ | |||
| lxvd2x vs25, o16, A3 | |||
| lxvd2x vs26, o32, A3 | |||
| lxvd2x vs27, o48, A3 | |||
| addi A3, A3, 64 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs16, vs24, 0 | |||
| xxpermdi vs34, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs35, vs16, vs24, 3 | |||
| xxpermdi vs36, vs1, vs9, 0 | |||
| xxpermdi vs37, vs17, vs25, 0 | |||
| xxpermdi vs38, vs1, vs9, 3 | |||
| xxpermdi vs39, vs17, vs25, 3 | |||
| xxpermdi vs40, vs2, vs10, 0 | |||
| xxpermdi vs41, vs18, vs26, 0 | |||
| xxpermdi vs42, vs2, vs10, 3 | |||
| xxpermdi vs43, vs18, vs26, 3 | |||
| xxpermdi vs44, vs3, vs11, 0 | |||
| xxpermdi vs45, vs19, vs27, 0 | |||
| xxpermdi vs46, vs3, vs11, 3 | |||
| xxpermdi vs47, vs19, vs27, 3 | |||
| stxvd2x vs32, o0, BO /* stores: two runs of 128 bytes, BO bumped after each run (256 bytes in total) */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| stxvd2x vs36, o64, BO | |||
| stxvd2x vs37, o80, BO | |||
| stxvd2x vs38, o96, BO | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| stxvd2x vs40, o0, BO | |||
| stxvd2x vs41, o16, BO | |||
| stxvd2x vs42, o32, BO | |||
| stxvd2x vs43, o48, BO | |||
| stxvd2x vs44, o64, BO | |||
| stxvd2x vs45, o80, BO | |||
| stxvd2x vs46, o96, BO | |||
| stxvd2x vs47, o112, BO | |||
| addi BO, BO, 128 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x4 /* read 4 doubles from each of A0..A3 and write them to BO interleaved as A0[i], A1[i], A2[i], A3[i] */ | |||
| lxvd2x vs0, o0, A0 /* A0 -> vs0-vs1 */ | |||
| lxvd2x vs1, o16, A0 | |||
| addi A0, A0, 32 /* each source pointer moves past the 32 bytes just read */ | |||
| lxvd2x vs8, o0, A1 /* A1 -> vs8-vs9 */ | |||
| lxvd2x vs9, o16, A1 | |||
| addi A1, A1, 32 | |||
| lxvd2x vs16, o0, A2 /* A2 -> vs16-vs17 */ | |||
| lxvd2x vs17, o16, A2 | |||
| addi A2, A2, 32 | |||
| lxvd2x vs24, o0, A3 /* A3 -> vs24-vs25 */ | |||
| lxvd2x vs25, o16, A3 | |||
| addi A3, A3, 32 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs16, vs24, 0 | |||
| xxpermdi vs34, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs35, vs16, vs24, 3 | |||
| xxpermdi vs36, vs1, vs9, 0 | |||
| xxpermdi vs37, vs17, vs25, 0 | |||
| xxpermdi vs38, vs1, vs9, 3 | |||
| xxpermdi vs39, vs17, vs25, 3 | |||
| stxvd2x vs32, o0, BO /* 128 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| stxvd2x vs36, o64, BO | |||
| stxvd2x vs37, o80, BO | |||
| stxvd2x vs38, o96, BO | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x2 /* read 2 doubles from each of A0..A3 and write them to BO interleaved as A0[i], A1[i], A2[i], A3[i] */ | |||
| lxvd2x vs0, o0, A0 /* one 16-byte load per source, each followed by a 16-byte pointer bump */ | |||
| addi A0, A0, 16 | |||
| lxvd2x vs8, o0, A1 | |||
| addi A1, A1, 16 | |||
| lxvd2x vs16, o0, A2 | |||
| addi A2, A2, 16 | |||
| lxvd2x vs24, o0, A3 | |||
| addi A3, A3, 16 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs16, vs24, 0 | |||
| xxpermdi vs34, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs35, vs16, vs24, 3 | |||
| stxvd2x vs32, o0, BO /* 64 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| addi BO, BO, 64 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x1 /* read 1 double from each of A0..A3 and write the four values to BO in the order A0, A1, A2, A3 */ | |||
| lxsdx vs0, o0, A0 /* scalar load: the value sits in the first doubleword of the register */ | |||
| addi A0, A0, 8 | |||
| lxsdx vs8, o0, A1 | |||
| addi A1, A1, 8 | |||
| lxsdx vs16, o0, A2 | |||
| addi A2, A2, 8 | |||
| lxsdx vs24, o0, A3 | |||
| addi A3, A3, 8 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 keeps only the first doubleword of each input, i.e. the loaded scalars */ | |||
| xxpermdi vs33, vs16, vs24, 0 | |||
| stxvd2x vs32, o0, BO /* 32 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| addi BO, BO, 32 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=16 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x16 /* read 16 doubles from each of A0 and A1 and write them to BO interleaved as A0[i], A1[i] */ | |||
| lxvd2x vs0, o0, A0 /* A0 -> vs0-vs7 */ | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs3, o48, A0 | |||
| lxvd2x vs4, o64, A0 | |||
| lxvd2x vs5, o80, A0 | |||
| lxvd2x vs6, o96, A0 | |||
| lxvd2x vs7, o112, A0 | |||
| addi A0, A0, 128 /* each source pointer moves past the 128 bytes just read */ | |||
| lxvd2x vs8, o0, A1 /* A1 -> vs8-vs15 */ | |||
| lxvd2x vs9, o16, A1 | |||
| lxvd2x vs10, o32, A1 | |||
| lxvd2x vs11, o48, A1 | |||
| lxvd2x vs12, o64, A1 | |||
| lxvd2x vs13, o80, A1 | |||
| lxvd2x vs14, o96, A1 | |||
| lxvd2x vs15, o112, A1 | |||
| addi A1, A1, 128 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs34, vs1, vs9, 0 | |||
| xxpermdi vs35, vs1, vs9, 3 | |||
| xxpermdi vs36, vs2, vs10, 0 | |||
| xxpermdi vs37, vs2, vs10, 3 | |||
| xxpermdi vs38, vs3, vs11, 0 | |||
| xxpermdi vs39, vs3, vs11, 3 | |||
| xxpermdi vs40, vs4, vs12, 0 | |||
| xxpermdi vs41, vs4, vs12, 3 | |||
| xxpermdi vs42, vs5, vs13, 0 | |||
| xxpermdi vs43, vs5, vs13, 3 | |||
| xxpermdi vs44, vs6, vs14, 0 | |||
| xxpermdi vs45, vs6, vs14, 3 | |||
| xxpermdi vs46, vs7, vs15, 0 | |||
| xxpermdi vs47, vs7, vs15, 3 | |||
| stxvd2x vs32, o0, BO /* stores: two runs of 128 bytes, BO bumped after each run (256 bytes in total) */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| stxvd2x vs36, o64, BO | |||
| stxvd2x vs37, o80, BO | |||
| stxvd2x vs38, o96, BO | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| stxvd2x vs40, o0, BO | |||
| stxvd2x vs41, o16, BO | |||
| stxvd2x vs42, o32, BO | |||
| stxvd2x vs43, o48, BO | |||
| stxvd2x vs44, o64, BO | |||
| stxvd2x vs45, o80, BO | |||
| stxvd2x vs46, o96, BO | |||
| stxvd2x vs47, o112, BO | |||
| addi BO, BO, 128 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x8 /* read 8 doubles from each of A0 and A1 and write them to BO interleaved as A0[i], A1[i] */ | |||
| lxvd2x vs0, o0, A0 /* A0 -> vs0-vs3 */ | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs3, o48, A0 | |||
| addi A0, A0, 64 /* each source pointer moves past the 64 bytes just read */ | |||
| lxvd2x vs8, o0, A1 /* A1 -> vs8-vs11 */ | |||
| lxvd2x vs9, o16, A1 | |||
| lxvd2x vs10, o32, A1 | |||
| lxvd2x vs11, o48, A1 | |||
| addi A1, A1, 64 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs34, vs1, vs9, 0 | |||
| xxpermdi vs35, vs1, vs9, 3 | |||
| xxpermdi vs36, vs2, vs10, 0 | |||
| xxpermdi vs37, vs2, vs10, 3 | |||
| xxpermdi vs38, vs3, vs11, 0 | |||
| xxpermdi vs39, vs3, vs11, 3 | |||
| stxvd2x vs32, o0, BO /* 128 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| stxvd2x vs36, o64, BO | |||
| stxvd2x vs37, o80, BO | |||
| stxvd2x vs38, o96, BO | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x4 /* read 4 doubles from each of A0 and A1 and write them to BO interleaved as A0[i], A1[i] */ | |||
| lxvd2x vs0, o0, A0 /* A0 -> vs0-vs1 */ | |||
| lxvd2x vs1, o16, A0 | |||
| addi A0, A0, 32 /* each source pointer moves past the 32 bytes just read */ | |||
| lxvd2x vs8, o0, A1 /* A1 -> vs8-vs9 */ | |||
| lxvd2x vs9, o16, A1 | |||
| addi A1, A1, 32 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| xxpermdi vs34, vs1, vs9, 0 | |||
| xxpermdi vs35, vs1, vs9, 3 | |||
| stxvd2x vs32, o0, BO /* 64 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| stxvd2x vs34, o32, BO | |||
| stxvd2x vs35, o48, BO | |||
| addi BO, BO, 64 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x2 /* read 2 doubles from each of A0 and A1 and write them to BO interleaved as A0[i], A1[i] */ | |||
| lxvd2x vs0, o0, A0 /* one 16-byte load per source, each followed by a 16-byte pointer bump */ | |||
| addi A0, A0, 16 | |||
| lxvd2x vs8, o0, A1 | |||
| addi A1, A1, 16 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 pairs the first doubleword of both inputs */ | |||
| xxpermdi vs33, vs0, vs8, 3 /* selector 3 pairs the second doubleword of both inputs */ | |||
| stxvd2x vs32, o0, BO /* 32 bytes written, then BO advances by the same amount */ | |||
| stxvd2x vs33, o16, BO | |||
| addi BO, BO, 32 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x1 /* read 1 double from each of A0 and A1 and write the pair to BO in the order A0, A1 */ | |||
| lxsdx vs0, o0, A0 /* scalar load: the value sits in the first doubleword of the register */ | |||
| addi A0, A0, 8 | |||
| lxsdx vs8, o0, A1 | |||
| addi A1, A1, 8 | |||
| xxpermdi vs32, vs0, vs8, 0 /* selector 0 keeps only the first doubleword of each input, i.e. the loaded scalars */ | |||
| stxvd2x vs32, o0, BO /* 16 bytes written, then BO advances by the same amount */ | |||
| addi BO, BO, 16 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=16 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x16 /* straight copy of 16 doubles (128 bytes) from A0 to BO; both pointers advance by 128 */ | |||
| lxvd2x vs0, o0, A0 /* all eight loads are issued before the first store */ | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs3, o48, A0 | |||
| lxvd2x vs4, o64, A0 | |||
| lxvd2x vs5, o80, A0 | |||
| lxvd2x vs6, o96, A0 | |||
| lxvd2x vs7, o112, A0 | |||
| addi A0, A0, 128 | |||
| stxvd2x vs0, o0, BO /* written as two 64-byte halves, BO bumped after each half */ | |||
| stxvd2x vs1, o16, BO | |||
| stxvd2x vs2, o32, BO | |||
| stxvd2x vs3, o48, BO | |||
| addi BO, BO, 64 | |||
| stxvd2x vs4, o0, BO | |||
| stxvd2x vs5, o16, BO | |||
| stxvd2x vs6, o32, BO | |||
| stxvd2x vs7, o48, BO | |||
| addi BO, BO, 64 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x8 /* straight copy of 8 doubles (64 bytes) from A0 to BO; both pointers advance by 64 */ | |||
| lxvd2x vs0, o0, A0 /* all four loads are issued before the first store */ | |||
| lxvd2x vs1, o16, A0 | |||
| lxvd2x vs2, o32, A0 | |||
| lxvd2x vs3, o48, A0 | |||
| addi A0, A0, 64 | |||
| stxvd2x vs0, o0, BO | |||
| stxvd2x vs1, o16, BO | |||
| stxvd2x vs2, o32, BO | |||
| stxvd2x vs3, o48, BO | |||
| addi BO, BO, 64 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x4 /* straight copy of 4 doubles (32 bytes) from A0 to BO; both pointers advance by 32 */ | |||
| lxvd2x vs0, o0, A0 /* both loads are issued before the first store */ | |||
| lxvd2x vs1, o16, A0 | |||
| addi A0, A0, 32 | |||
| stxvd2x vs0, o0, BO | |||
| stxvd2x vs1, o16, BO | |||
| addi BO, BO, 32 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x2 /* straight copy of 2 doubles (16 bytes) from A0 to BO; both pointers advance by 16 */ | |||
| lxvd2x vs0, o0, A0 | |||
| addi A0, A0, 16 | |||
| stxvd2x vs0, o0, BO | |||
| addi BO, BO, 16 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x1 /* copy a single double (8 bytes) from A0 to BO; both pointers advance by 8 */ | |||
| lxsdx vs0, o0, A0 /* scalar load and scalar store: only one doubleword is transferred */ | |||
| addi A0, A0, 8 | |||
| stxsdx vs0, o0, BO | |||
| addi BO, BO, 8 | |||
| .endm | |||
| @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add B2, B2, B | |||
| add B1, B1, B | |||
| li PREA, 768 | |||
| li PREA, 256 | |||
| addi PREB, M16, 128 | |||
| li o8, 8 | |||
| @@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN: | |||
| DCOPYT_L4x16_LOOP: | |||
| /* | |||
| addi T1, PREB, 128 | |||
| addi T2, PREB, 256 | |||
| */ | |||
| dcbt A0, PREA | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| /* | |||
| dcbtst BO, M16 | |||
| dcbtst BO, PREB | |||
| dcbtst BO, T1 | |||
| dcbtst BO, T2 | |||
| */ | |||
| COPY_4x16 | |||
| add BO, BO, M16 | |||
| @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define PRE r30 | |||
| #define T2 r31 | |||
| #include "dgemm_macros_16x4_power8.S" | |||
| #include "dtrmm_macros_16x4_power8.S" | |||
| #ifndef NEEDPARAM | |||
| @@ -0,0 +1,207 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define M r3 /* r3..r7 are named M, N, A, LDA, B; presumably the incoming arguments in that order -- confirm against the caller */ | |||
| #define N r4 | |||
| #define A r5 | |||
| #define LDA r6 | |||
| #define B r7 | |||
| #define A0 r8 /* A0..A3: per-group source pointers, set up in the included logic as A, A+LDA, A+2*LDA, A+3*LDA */ | |||
| #define A1 r9 | |||
| #define A2 r10 | |||
| #define A3 r11 | |||
| #define J r12 /* inner loop counter (N >> 3 in the included logic) */ | |||
| #define PREA r14 /* first a temporary mask (-4), later the dcbt prefetch offset (384) */ | |||
| #define PREB r15 /* first a temporary mask (-2), later M8 + 128 */ | |||
| #define BO r16 /* current store pointer handed to the COPY_* macros */ | |||
| #define B8 r17 /* B8/B4/B2/B1: output cursors for the 8-, 4-, 2- and 1-wide pieces */ | |||
| #define B4 r18 | |||
| #define B2 r19 | |||
| #define B1 r20 | |||
| #define o4 r21 /* loaded with the constant 4 before the logic is included */ | |||
| #define T2 r22 /* temporary; holds the mask -8 during setup */ | |||
| #define I r23 /* outer loop counter (M >> 2 in the included logic) */ | |||
| #define o16 r24 /* o16/o32/o48 are loaded with the constants 16/32/48 before the logic is included */ | |||
| #define o32 r25 | |||
| #define o48 r26 | |||
| #define NOTU1 r29 /* NOTE(review): no use of NOTU1 is visible in this routine or its included macro/logic files */ | |||
| #define M8 r30 /* M << (3 + BASE_SHIFT); added to BO between consecutive 8-wide copies */ | |||
| #define T1 r31 /* scratch: overwritten by every COPY_* macro and by the andi. tests in the logic */ | |||
| #define o0 0 /* a literal 0, not a register: as the RA operand of an indexed load/store it means base 0 */ | |||
| #include "sgemm_tcopy_macros_8_power8.S" | |||
| #define STACKSIZE 384 /* frame size in bytes; the register save slots used below occupy 144..280(SP) */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* open a STACKSIZE-byte frame; released by the matching addi just before blr */ | |||
| li r0, 0 /* NOTE(review): r0 is zeroed, but no later read of r0 is visible in this routine or in the macro/logic files it includes -- confirm before removing */ | |||
| std r31, 144(SP) /* save r31..r14 (18 registers) at 144..280(SP); every register alias from r14 upward lives in this range */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| cmpwi cr0, M, 0 /* 32-bit signed compare: nothing to copy when M <= 0 */ | |||
| ble- L999 | |||
| cmpwi cr0, N, 0 /* likewise when N <= 0 */ | |||
| ble- L999 | |||
| slwi LDA, LDA, BASE_SHIFT /* LDA <<= BASE_SHIFT; presumably turns an element stride into a byte stride (BASE_SHIFT comes from common.h). slwi is a 32-bit shift */ | |||
| slwi M8, M, 3 + BASE_SHIFT /* M8 = M << (3 + BASE_SHIFT): the step added to BO between consecutive 8-wide copies */ | |||
| li T2, -8 /* masks -8, -4, -2 clear the low 3, 2 and 1 bits of N */ | |||
| li PREA, -4 | |||
| li PREB, -2 | |||
| and B4, N, T2 /* B4 = N rounded down to a multiple of 8 */ | |||
| and B2, N, PREA /* B2 = N rounded down to a multiple of 4 */ | |||
| and B1, N, PREB /* B1 = N rounded down to a multiple of 2 */ | |||
| mullw B4, B4, M /* scale each rounded N by M (32-bit multiply) ... */ | |||
| mullw B2, B2, M | |||
| mullw B1, B1, M | |||
| slwi B4, B4, BASE_SHIFT /* ... then shift left by BASE_SHIFT ... */ | |||
| slwi B2, B2, BASE_SHIFT | |||
| slwi B1, B1, BASE_SHIFT | |||
| add B4, B4, B /* ... and add B: B4/B2/B1 are where the 4-, 2- and 1-wide leftovers get written */ | |||
| add B2, B2, B | |||
| add B1, B1, B | |||
| li PREA, 384 /* PREA is reused from here on as the dcbt prefetch offset */ | |||
| addi PREB, M8, 128 /* PREB is reused as M8 + 128; no reader of PREB is visible in the included logic */ | |||
| li o4, 4 /* constant index registers used as offsets by the COPY_* macros */ | |||
| li o16, 16 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| #include "sgemm_tcopy_logic_8_power8.S" | |||
| L999: /* common exit; also the target of the M <= 0 and N <= 0 checks above */ | |||
| li r3, 0 /* r3 is set to 0 before returning */ | |||
| ld r31, 144(SP) /* restore r31..r14 from the slots written at entry */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| @@ -0,0 +1,299 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| srawi. I, M, 2 /* I = M >> 2: number of passes that consume four source pointers (A0..A3) at once */ | |||
| ble SCOPYOT_L2_BEGIN /* no full group of four: go straight to the 2-wide tail */ | |||
| SCOPYOT_L4_BEGIN: /* one pass: copy N elements from each of A0..A3 in pieces of 8, 4, 2 and 1 */ | |||
| mr A0, A /* A0..A3 are spaced LDA apart; A then moves on by 4*LDA for the next pass */ | |||
| add A1, A0, LDA | |||
| add A2, A1, LDA | |||
| add A3, A2, LDA | |||
| add A, A3, LDA | |||
| mr B8, B /* B8 = where this pass writes its 8-wide pieces */ | |||
| addi B, B, 32*SIZE /* the next pass starts 32 elements (4 x 8) further on in B */ | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide pieces */ | |||
| ble SCOPYOT_L4x4_BEGIN | |||
| mr BO, B8 | |||
| .align 5 | |||
| SCOPYOT_L4x8_LOOP: /* body is unrolled four times; each copy has its own countdown of J and its own exit */ | |||
| dcbt A0, PREA /* prefetch hint at PREA bytes beyond each source pointer; issued once per four copies */ | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| COPY_4x8 /* the macro advances no pointers, so the stepping is done here */ | |||
| addi A0, A0, 8*SIZE | |||
| addi A1, A1, 8*SIZE | |||
| addi A2, A2, 8*SIZE | |||
| addi A3, A3, 8*SIZE | |||
| add BO, BO, M8 /* consecutive 8-wide pieces of one pass are M8 bytes apart in the output */ | |||
| addic. J, J, -1 | |||
| ble SCOPYOT_L4x4_BEGIN | |||
| COPY_4x8 /* unrolled copy 2 of 4 */ | |||
| addi A0, A0, 8*SIZE | |||
| addi A1, A1, 8*SIZE | |||
| addi A2, A2, 8*SIZE | |||
| addi A3, A3, 8*SIZE | |||
| add BO, BO, M8 | |||
| addic. J, J, -1 | |||
| ble SCOPYOT_L4x4_BEGIN | |||
| COPY_4x8 /* unrolled copy 3 of 4 */ | |||
| addi A0, A0, 8*SIZE | |||
| addi A1, A1, 8*SIZE | |||
| addi A2, A2, 8*SIZE | |||
| addi A3, A3, 8*SIZE | |||
| add BO, BO, M8 | |||
| addic. J, J, -1 | |||
| ble SCOPYOT_L4x4_BEGIN | |||
| COPY_4x8 /* unrolled copy 4 of 4 */ | |||
| addi A0, A0, 8*SIZE | |||
| addi A1, A1, 8*SIZE | |||
| addi A2, A2, 8*SIZE | |||
| addi A3, A3, 8*SIZE | |||
| add BO, BO, M8 | |||
| addic. J, J, -1 | |||
| bgt SCOPYOT_L4x8_LOOP | |||
| SCOPYOT_L4x4_BEGIN: | |||
| andi. T1, N, 4 /* is bit 2 of N set? the andi. result is never negative, so ble below fires only when it is zero */ | |||
| ble SCOPYOT_L4x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_4x4 | |||
| addi A0, A0, 4*SIZE | |||
| addi A1, A1, 4*SIZE | |||
| addi A2, A2, 4*SIZE | |||
| addi A3, A3, 4*SIZE | |||
| addi B4, B4, 16*SIZE /* 4 x 4 elements were written at B4 */ | |||
| SCOPYOT_L4x2_BEGIN: | |||
| andi. T1, N, 2 /* is bit 1 of N set? */ | |||
| ble SCOPYOT_L4x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_4x2 | |||
| addi A0, A0, 2*SIZE | |||
| addi A1, A1, 2*SIZE | |||
| addi A2, A2, 2*SIZE | |||
| addi A3, A3, 2*SIZE | |||
| addi B2, B2, 8*SIZE /* 4 x 2 elements were written at B2 */ | |||
| SCOPYOT_L4x1_BEGIN: | |||
| andi. T1, N, 1 /* is bit 0 of N set? */ | |||
| ble SCOPYOT_L4_END | |||
| mr BO, B1 | |||
| COPY_4x1 | |||
| addi A0, A0, 1*SIZE | |||
| addi A1, A1, 1*SIZE | |||
| addi A2, A2, 1*SIZE | |||
| addi A3, A3, 1*SIZE | |||
| addi B1, B1, 4*SIZE /* 4 x 1 elements were written at B1 */ | |||
| SCOPYOT_L4_END: | |||
| addic. I, I, -1 /* next group of four source pointers, if any remain */ | |||
| bgt SCOPYOT_L4_BEGIN | |||
| SCOPYOT_L2_BEGIN: /* tail for bit 1 of M: one pass over two source pointers (A0, A1), run at most once */ | |||
| andi. T1, M, 2 /* the andi. result is never negative, so ble below fires only when the bit is clear */ | |||
| ble SCOPYOT_L1_BEGIN | |||
| mr A0, A /* A0 and A1 are LDA apart; A moves on by 2*LDA */ | |||
| add A1, A0, LDA | |||
| add A, A1, LDA | |||
| mr B8, B /* B8 = where this pass writes its 8-wide pieces */ | |||
| addi B, B, 16*SIZE /* 2 x 8 elements of B are reserved for this pass */ | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide pieces */ | |||
| ble SCOPYOT_L2x4_BEGIN | |||
| mr BO, B8 | |||
| SCOPYOT_L2x8_LOOP: /* not unrolled and without prefetch, unlike the 4-pointer loop */ | |||
| COPY_2x8 /* the macro advances no pointers, so the stepping is done here */ | |||
| addi A0, A0, 8*SIZE | |||
| addi A1, A1, 8*SIZE | |||
| add BO, BO, M8 /* consecutive 8-wide pieces are M8 bytes apart in the output */ | |||
| addic. J, J, -1 | |||
| bgt SCOPYOT_L2x8_LOOP | |||
| SCOPYOT_L2x4_BEGIN: | |||
| andi. T1, N, 4 /* is bit 2 of N set? */ | |||
| ble SCOPYOT_L2x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_2x4 | |||
| addi A0, A0, 4*SIZE | |||
| addi A1, A1, 4*SIZE | |||
| addi B4, B4, 8*SIZE /* 2 x 4 elements were written at B4 */ | |||
| SCOPYOT_L2x2_BEGIN: | |||
| andi. T1, N, 2 /* is bit 1 of N set? */ | |||
| ble SCOPYOT_L2x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_2x2 | |||
| addi A0, A0, 2*SIZE | |||
| addi A1, A1, 2*SIZE | |||
| addi B2, B2, 4*SIZE /* 2 x 2 elements were written at B2 */ | |||
| SCOPYOT_L2x1_BEGIN: | |||
| andi. T1, N, 1 /* is bit 0 of N set? */ | |||
| ble SCOPYOT_L2_END | |||
| mr BO, B1 | |||
| COPY_2x1 | |||
| addi A0, A0, 1*SIZE | |||
| addi A1, A1, 1*SIZE | |||
| addi B1, B1, 2*SIZE /* 2 x 1 elements were written at B1 */ | |||
| SCOPYOT_L2_END: | |||
| SCOPYOT_L1_BEGIN: /* tail for bit 0 of M: one pass over a single source pointer (A0), run at most once */ | |||
| andi. T1, M, 1 /* the andi. result is never negative, so ble below fires only when the bit is clear */ | |||
| ble L999 /* L999 is the exit label of the file that includes this logic */ | |||
| mr A0, A | |||
| add A, A0, LDA | |||
| mr B8, B /* B8 = where this pass writes its 8-wide pieces */ | |||
| addi B, B, 8*SIZE /* 1 x 8 elements of B are reserved for this pass */ | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide pieces */ | |||
| ble SCOPYOT_L1x4_BEGIN | |||
| mr BO, B8 | |||
| SCOPYOT_L1x8_LOOP: | |||
| COPY_1x8 /* the macro advances no pointers, so the stepping is done here */ | |||
| addi A0, A0, 8*SIZE | |||
| add BO, BO, M8 /* consecutive 8-wide pieces are M8 bytes apart in the output */ | |||
| addic. J, J, -1 | |||
| bgt SCOPYOT_L1x8_LOOP | |||
| SCOPYOT_L1x4_BEGIN: | |||
| andi. T1, N, 4 /* is bit 2 of N set? */ | |||
| ble SCOPYOT_L1x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_1x4 | |||
| addi A0, A0, 4*SIZE | |||
| addi B4, B4, 4*SIZE /* 1 x 4 elements were written at B4 */ | |||
| SCOPYOT_L1x2_BEGIN: | |||
| andi. T1, N, 2 /* is bit 1 of N set? */ | |||
| ble SCOPYOT_L1x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_1x2 | |||
| addi A0, A0, 2*SIZE | |||
| addi B2, B2, 2*SIZE /* 1 x 2 elements were written at B2 */ | |||
| SCOPYOT_L1x1_BEGIN: | |||
| andi. T1, N, 1 /* is bit 0 of N set? */ | |||
| ble SCOPYOT_L1_END | |||
| mr BO, B1 | |||
| COPY_1x1 | |||
| addi A0, A0, 1*SIZE | |||
| addi B1, B1, 1*SIZE /* one element was written at B1 */ | |||
| SCOPYOT_L1_END: | |||
| @@ -0,0 +1,308 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x8 /* Read 32 bytes (eight 32-bit elements) at each of A0..A3 and write them back to back, A0 first, as 128 bytes at BO. Advances no pointer; clobbers T1 and vs32..vs39. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| lxvw4x vs33, o16, A0 /* o16/o32/o48 are registers that the including file loads with 16/32/48 */ | |||
| lxvw4x vs34, o0, A1 | |||
| lxvw4x vs35, o16, A1 | |||
| lxvw4x vs36, o0, A2 | |||
| lxvw4x vs37, o16, A2 | |||
| lxvw4x vs38, o0, A3 | |||
| lxvw4x vs39, o16, A3 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 /* first 64 bytes: the A0 data, then the A1 data */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvw4x vs36, o0, T1 /* second 64 bytes: the A2 data, then the A3 data */ | |||
| stxvw4x vs37, o16, T1 | |||
| stxvw4x vs38, o32, T1 | |||
| stxvw4x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x4 /* Read 16 bytes (four 32-bit elements) at each of A0..A3 and write them back to back, A0 first, as 64 bytes at BO. Advances no pointer; clobbers T1 and vs32..vs35. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| lxvw4x vs33, o0, A1 | |||
| lxvw4x vs34, o0, A2 | |||
| lxvw4x vs35, o0, A3 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 /* o16/o32/o48 are registers that the including file loads with 16/32/48 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x2 /* Read two single-precision elements at each of A0..A3 and write the eight values back to back, A0 pair first, at BO. Advances no pointer; clobbers T1 and vs32..vs39. */ | |||
| lxsspx vs32, o0, A0 /* scalar loads, one element each: offset 0 and offset o4 (a register the including file loads with 4) */ | |||
| lxsspx vs33, o4, A0 | |||
| lxsspx vs34, o0, A1 | |||
| lxsspx vs35, o4, A1 | |||
| lxsspx vs36, o0, A2 | |||
| lxsspx vs37, o4, A2 | |||
| lxsspx vs38, o0, A3 | |||
| lxsspx vs39, o4, A3 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 /* A0 pair */ | |||
| stxsspx vs33, o4, T1 | |||
| addi T1, T1, 8 /* each pair occupies 8 bytes */ | |||
| stxsspx vs34, o0, T1 /* A1 pair */ | |||
| stxsspx vs35, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs36, o0, T1 /* A2 pair */ | |||
| stxsspx vs37, o4, T1 | |||
| addi T1, T1, 8 | |||
| stxsspx vs38, o0, T1 /* A3 pair */ | |||
| stxsspx vs39, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x1 /* Read one single-precision element at each of A0..A3 and write the four values back to back, A0 first, at BO. Advances no pointer; clobbers T1 and vs32..vs35. */ | |||
| lxsspx vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| lxsspx vs33, o0, A1 | |||
| lxsspx vs34, o0, A2 | |||
| lxsspx vs35, o0, A3 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 /* o4 is a register that the including file loads with 4 */ | |||
| stxsspx vs33, o4, T1 | |||
| addi T1, T1, 8 /* second half of the 16-byte output */ | |||
| stxsspx vs34, o0, T1 | |||
| stxsspx vs35, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x8 /* Read 32 bytes (eight 32-bit elements) at each of A0 and A1 and write them back to back, A0 first, as 64 bytes at BO. Advances no pointer; clobbers T1 and vs32..vs35. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0; o16/o32/o48 are registers that the including file loads with 16/32/48 */ | |||
| lxvw4x vs33, o16, A0 | |||
| lxvw4x vs34, o0, A1 | |||
| lxvw4x vs35, o16, A1 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 /* the A0 data occupies bytes 0..31, the A1 data bytes 32..63 */ | |||
| stxvw4x vs33, o16, T1 | |||
| stxvw4x vs34, o32, T1 | |||
| stxvw4x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x4 /* Read 16 bytes (four 32-bit elements) at each of A0 and A1 and write them back to back, A0 first, as 32 bytes at BO. Advances no pointer; clobbers T1, vs32, vs33. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| lxvw4x vs33, o0, A1 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 | |||
| stxvw4x vs33, o16, T1 /* o16 is a register that the including file loads with 16 */ | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x2 /* Read two single-precision elements at each of A0 and A1 and write the four values back to back, A0 pair first, at BO. Advances no pointer; clobbers T1 and vs32..vs35. */ | |||
| lxsspx vs32, o0, A0 /* scalar loads, one element each: offset 0 and offset o4 (a register the including file loads with 4) */ | |||
| lxsspx vs33, o4, A0 | |||
| lxsspx vs34, o0, A1 | |||
| lxsspx vs35, o4, A1 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 /* A0 pair */ | |||
| stxsspx vs33, o4, T1 | |||
| addi T1, T1, 8 /* each pair occupies 8 bytes */ | |||
| stxsspx vs34, o0, T1 /* A1 pair */ | |||
| stxsspx vs35, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x1 /* Read one single-precision element at each of A0 and A1 and write the two values back to back, A0 first, at BO. Advances no pointer; clobbers T1, vs32, vs33. */ | |||
| lxsspx vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| lxsspx vs33, o0, A1 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 | |||
| stxsspx vs33, o4, T1 /* o4 is a register that the including file loads with 4 */ | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x8 /* Copy 32 bytes (eight 32-bit elements) from A0 to BO. Advances no pointer; clobbers T1, vs32, vs33. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0; o16 is a register that the including file loads with 16 */ | |||
| lxvw4x vs33, o16, A0 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 | |||
| stxvw4x vs33, o16, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x4 /* Copy 16 bytes (four 32-bit elements) from A0 to BO. Advances no pointer; clobbers T1 and vs32. */ | |||
| lxvw4x vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxvw4x vs32, o0, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x2 /* Copy two single-precision elements from A0 to BO, one scalar load/store each. Advances no pointer; clobbers T1, vs32, vs33. */ | |||
| lxsspx vs32, o0, A0 /* offset 0, then offset o4 (a register the including file loads with 4) */ | |||
| lxsspx vs33, o4, A0 | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 | |||
| stxsspx vs33, o4, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x1 /* Copy one single-precision element from A0 to BO. Advances no pointer; clobbers T1 and vs32. */ | |||
| lxsspx vs32, o0, A0 /* o0 is the literal 0, so the address is just A0 */ | |||
| mr T1, BO /* store through a scratch copy so BO itself stays unchanged */ | |||
| stxsspx vs32, o0, T1 | |||
| .endm | |||
| @@ -1,3 +1,73 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| @@ -250,7 +320,7 @@ | |||
| ble L999 | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 384 | |||
| li PRE, 512 | |||
| li o8 , 8 | |||
| li o16 , 16 | |||
| li o24 , 24 | |||
| @@ -1,3 +1,39 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| srawi. J, N, 1 | |||
| ble ZGEMM_L2_END | |||
| @@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN: | |||
| mr BO, B | |||
| mr BBO, BBUFFER | |||
| slwi T1, K, 1 | |||
| srawi. T1, K, 2 | |||
| ble ZGEMM_L2_COPYB1 | |||
| ZGEMM_L2_COPYB: | |||
| ZGEMM_L2_COPYB8: | |||
| lxvdsx vs4, o0, BO // b0_r | |||
| lxvdsx vs5, o8, BO // b0_i | |||
| addi BO, BO, 16 | |||
| stxvd2x vs4, o0, BBO | |||
| stxvd2x vs5, o16, BBO | |||
| addi T2, PRE, 128 | |||
| dcbt BO, PRE | |||
| dcbtst BBO, PRE | |||
| dcbtst BBO, T2 | |||
| ZCOPYB_8x1 | |||
| addic. T1, T1, -1 | |||
| addi BBO, BBO, 32 | |||
| bge ZGEMM_L2_COPYB | |||
| bgt ZGEMM_L2_COPYB8 | |||
| ZGEMM_L2_COPYB1: | |||
| andi. T1, K, 3 | |||
| ble ZGEMM_L2_COPYB_END | |||
| ZGEMM_L2_COPYB_LOOP: | |||
| ZCOPYB_1x1 | |||
| ZCOPYB_1x1 | |||
| addic. T1, T1, -1 | |||
| bgt ZGEMM_L2_COPYB_LOOP | |||
| ZGEMM_L2_COPYB_END: | |||
| mr CO, C | |||
| mr AO, A | |||
| @@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN: | |||
| slwi T1, K, 0 | |||
| ZGEMM_L1_COPYB: | |||
| dcbtst BBO, PRE | |||
| lxvdsx vs4, o0, BO // b0_r | |||
| lxvdsx vs5, o8, BO // b0_i | |||
| @@ -1,3 +1,38 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define XSFADD_R1 xsadddp | |||
| @@ -3055,3 +3090,76 @@ | |||
| .endm | |||
| .macro ZCOPYB_1x1 /* Expands one 16-byte (real, imag) pair at BO into two splatted 16-byte vectors at BBO; advances BO by 16 and BBO by 32. Clobbers vs4, vs5. */ | |||
| lxvdsx vs4, o0, BO // b0_r | |||
| lxvdsx vs5, o8, BO // b0_i | |||
| addi BO, BO, 16 | |||
| stxvd2x vs4, o0, BBO /* splatted real part goes first */ | |||
| stxvd2x vs5, o16, BBO /* splatted imaginary part follows at index o16 */ | |||
| addi BBO, BBO, 32 | |||
| .endm | |||
| .macro ZCOPYB_8x1 /* Reads eight 16-byte vectors (128 bytes) from BO, splats each doubleword half into its own vector, and writes the 16 results (256 bytes) to BBO. Advances BO by 128 and BBO by 256. Clobbers vs32-vs55. */ | |||
| lxvd2x vs32, o0, BO | |||
| lxvd2x vs33, o16, BO | |||
| lxvd2x vs34, o32, BO | |||
| lxvd2x vs35, o48, BO | |||
| addi BO, BO, 64 /* first four vectors consumed */ | |||
| lxvd2x vs36, o0, BO | |||
| lxvd2x vs37, o16, BO | |||
| lxvd2x vs38, o32, BO | |||
| lxvd2x vs39, o48, BO | |||
| addi BO, BO, 64 /* second four vectors consumed */ | |||
| xxspltd vs40, vs32, 0 /* each source vector yields two outputs: doubleword 0 splatted, then doubleword 1 splatted */ | |||
| xxspltd vs41, vs32, 1 | |||
| xxspltd vs42, vs33, 0 | |||
| xxspltd vs43, vs33, 1 | |||
| xxspltd vs44, vs34, 0 | |||
| xxspltd vs45, vs34, 1 | |||
| xxspltd vs46, vs35, 0 | |||
| xxspltd vs47, vs35, 1 | |||
| xxspltd vs48, vs36, 0 | |||
| xxspltd vs49, vs36, 1 | |||
| xxspltd vs50, vs37, 0 | |||
| xxspltd vs51, vs37, 1 | |||
| xxspltd vs52, vs38, 0 | |||
| xxspltd vs53, vs38, 1 | |||
| xxspltd vs54, vs39, 0 | |||
| xxspltd vs55, vs39, 1 | |||
| stxvd2x vs40, o0, BBO /* write vs40..vs55 back in order, 64 bytes per group of four stores */ | |||
| stxvd2x vs41, o16, BBO | |||
| stxvd2x vs42, o32, BBO | |||
| stxvd2x vs43, o48, BBO | |||
| addi BBO, BBO, 64 | |||
| stxvd2x vs44, o0, BBO | |||
| stxvd2x vs45, o16, BBO | |||
| stxvd2x vs46, o32, BBO | |||
| stxvd2x vs47, o48, BBO | |||
| addi BBO, BBO, 64 | |||
| stxvd2x vs48, o0, BBO | |||
| stxvd2x vs49, o16, BBO | |||
| stxvd2x vs50, o32, BBO | |||
| stxvd2x vs51, o48, BBO | |||
| addi BBO, BBO, 64 | |||
| stxvd2x vs52, o0, BBO | |||
| stxvd2x vs53, o16, BBO | |||
| stxvd2x vs54, o32, BBO | |||
| stxvd2x vs55, o48, BBO | |||
| addi BBO, BBO, 64 | |||
| .endm | |||
| @@ -0,0 +1,205 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER /* Symbolic register names and frame size used by the copy routine below. */ | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define M r3 /* r3-r7 = M, N, A, LDA, B: presumably the incoming arguments in call order — TODO confirm against the C prototype */ | |||
| #define N r4 | |||
| #define A r5 | |||
| #define LDA r6 | |||
| #define B r7 | |||
| #define A0 r8 /* A0-A3: source pointers, placed LDA apart by the copy logic */ | |||
| #define A1 r9 | |||
| #define A2 r10 | |||
| #define A3 r11 | |||
| #define J r12 /* inner (N >> 3) loop counter */ | |||
| #define PREA r14 /* prefetch offset applied to A0-A3; reused as a scratch mask during setup */ | |||
| #define PREB r15 /* prefetch offset applied to BO; reused as a scratch mask during setup */ | |||
| #define BO r16 /* destination pointer handed to the COPY_* macros */ | |||
| #define B8 r17 /* start of the current group's 8-wide destination */ | |||
| #define B4 r18 /* B4/B2/B1: running destinations for the 4-, 2- and 1-wide remainders */ | |||
| #define B2 r19 | |||
| #define B1 r20 | |||
| #define NOTUS1 r21 | |||
| #define T2 r22 /* scratch */ | |||
| #define I r23 /* outer (M >> 2) loop counter */ | |||
| #define o16 r24 /* o16/o32/o48: index registers loaded with 16/32/48 before the copy logic runs */ | |||
| #define o32 r25 | |||
| #define o48 r26 | |||
| #define NOTUS2 r27 | |||
| #define M8 r30 /* M << (3 + ZBASE_SHIFT): step added to BO after each 8-wide copy */ | |||
| #define T1 r31 /* scratch; also the running store pointer inside the COPY_* macros */ | |||
| #define o0 0 /* literal zero index for lxvd2x/stxvd2x */ | |||
| #include "zgemm_tcopy_macros_8_power8.S" | |||
| #define STACKSIZE 384 /* bytes the prologue subtracts from SP */ | |||
| PROLOGUE /* Entry point: saves r14-r31, derives strides and remainder destinations from M, N, LDA and B, runs the included copy logic, then restores registers and returns 0 in r3. */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||
| li r0, 0 | |||
| std r31, 144(SP) /* spill r31..r14 to 144(SP)..280(SP); reloaded from the same slots at L999 */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| cmpwi cr0, M, 0 /* nothing to copy when M <= 0 ... */ | |||
| ble- L999 | |||
| cmpwi cr0, N, 0 /* ... or when N <= 0 */ | |||
| ble- L999 | |||
| slwi LDA, LDA, ZBASE_SHIFT /* scale LDA by 2^ZBASE_SHIFT (presumably elements to bytes). NOTE(review): cmpwi/slwi are 32-bit forms — confirm dimensions always fit in 32 bits */ | |||
| slwi M8, M, 3 + ZBASE_SHIFT /* M8 = M * 8, scaled the same way */ | |||
| li T2, -8 /* T2/PREA/PREB briefly hold the masks -8/-4/-2 */ | |||
| li PREA, -4 | |||
| li PREB, -2 | |||
| and B4, N, T2 /* N rounded down to a multiple of 8 */ | |||
| and B2, N, PREA /* N rounded down to a multiple of 4 */ | |||
| and B1, N, PREB /* N rounded down to a multiple of 2 */ | |||
| mullw B4, B4, M /* multiply each rounded count by M ... */ | |||
| mullw B2, B2, M | |||
| mullw B1, B1, M | |||
| slwi B4, B4, ZBASE_SHIFT /* ... scale by 2^ZBASE_SHIFT ... */ | |||
| slwi B2, B2, ZBASE_SHIFT | |||
| slwi B1, B1, ZBASE_SHIFT | |||
| add B4, B4, B /* ... and offset from B: destinations for the 4-, 2- and 1-wide remainders */ | |||
| add B2, B2, B | |||
| add B1, B1, B | |||
| li PREA, 384 /* PREA now becomes the dcbt offset used on the source pointers */ | |||
| addi PREB, M8, 128 /* PREB = M8 + 128, a dcbtst offset relative to BO */ | |||
| li o16, 16 /* constant index registers for the lxvd2x/stxvd2x in the COPY_* macros */ | |||
| li o32, 32 | |||
| li o48, 48 | |||
| #include "zgemm_tcopy_logic_8_power8.S" | |||
| L999: /* common exit, also reached by the early-out branches above */ | |||
| li r3, 0 /* return value is always 0 */ | |||
| ld r31, 144(SP) /* reload the saved registers */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| @@ -0,0 +1,246 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| srawi. I, M, 2 /* I = M >> 2: number of 4-line groups; skip this section when there are none. NOTE(review): srawi is the 32-bit shift while N below uses sradi — confirm M always fits in 32 bits */ | |||
| ble ZCOPYT_L2_BEGIN | |||
| ZCOPYT_L4_BEGIN: /* one iteration per group of four source lines */ | |||
| mr A0, A /* A0..A3 = four source pointers LDA apart; A moves past all four */ | |||
| add A1, A0, LDA | |||
| add A2, A1, LDA | |||
| add A3, A2, LDA | |||
| add A, A3, LDA | |||
| mr B8, B /* B8 = where this group's 8-wide copies start */ | |||
| addi B, B, 64*SIZE /* the next group's 8-wide copies start 64*SIZE further on */ | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide copies */ | |||
| ble ZCOPYT_L4x4_BEGIN | |||
| mr BO, B8 | |||
| .align 5 | |||
| ZCOPYT_L4x8_LOOP: | |||
| addi T1, PREB, 128 /* two further store-prefetch offsets: PREB+128 and PREB+256 */ | |||
| addi T2, PREB, 256 | |||
| dcbt A0, PREA /* prefetch the four sources at offset PREA */ | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| dcbtst BO, M8 /* prefetch-for-store the destination at offsets M8, PREB, PREB+128, PREB+256 */ | |||
| dcbtst BO, PREB | |||
| dcbtst BO, T1 | |||
| dcbtst BO, T2 | |||
| COPY_4x8 | |||
| add BO, BO, M8 /* step the destination by M8 for the next 8-wide copy */ | |||
| addic. J, J, -1 | |||
| bgt ZCOPYT_L4x8_LOOP | |||
| ZCOPYT_L4x4_BEGIN: | |||
| andi. T1, N, 4 /* 4-wide remainder only when bit 2 of N is set */ | |||
| ble ZCOPYT_L4x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_4x4 | |||
| addi B4, B4, 32*SIZE /* advance the 4-wide remainder destination */ | |||
| ZCOPYT_L4x2_BEGIN: | |||
| andi. T1, N, 2 /* 2-wide remainder only when bit 1 of N is set */ | |||
| ble ZCOPYT_L4x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_4x2 | |||
| addi B2, B2, 16*SIZE /* advance the 2-wide remainder destination */ | |||
| ZCOPYT_L4x1_BEGIN: | |||
| andi. T1, N, 1 /* 1-wide remainder only when bit 0 of N is set */ | |||
| ble ZCOPYT_L4_END | |||
| mr BO, B1 | |||
| COPY_4x1 | |||
| addi B1, B1, 8*SIZE /* advance the 1-wide remainder destination */ | |||
| ZCOPYT_L4_END: | |||
| addic. I, I, -1 /* next 4-line group */ | |||
| bgt ZCOPYT_L4_BEGIN | |||
| ZCOPYT_L2_BEGIN: /* Handles one leftover pair of source lines, taken only when bit 1 of M is set. */ | |||
| andi. T1, M, 2 | |||
| ble ZCOPYT_L1_BEGIN | |||
| mr A0, A /* A0, A1 = two source pointers LDA apart; A moves past both */ | |||
| add A1, A0, LDA | |||
| add A, A1, LDA | |||
| mr B8, B /* B8 = where this pair's 8-wide copies start */ | |||
| addi B, B, 32*SIZE | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide copies */ | |||
| ble ZCOPYT_L2x4_BEGIN | |||
| mr BO, B8 | |||
| ZCOPYT_L2x8_LOOP: | |||
| COPY_2x8 | |||
| add BO, BO, M8 /* step the destination by M8 for the next 8-wide copy */ | |||
| addic. J, J, -1 | |||
| bgt ZCOPYT_L2x8_LOOP | |||
| ZCOPYT_L2x4_BEGIN: | |||
| andi. T1, N, 4 /* 4-wide remainder only when bit 2 of N is set */ | |||
| ble ZCOPYT_L2x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_2x4 | |||
| addi B4, B4, 16*SIZE | |||
| ZCOPYT_L2x2_BEGIN: | |||
| andi. T1, N, 2 /* 2-wide remainder only when bit 1 of N is set */ | |||
| ble ZCOPYT_L2x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_2x2 | |||
| addi B2, B2, 8*SIZE | |||
| ZCOPYT_L2x1_BEGIN: | |||
| andi. T1, N, 1 /* 1-wide remainder only when bit 0 of N is set */ | |||
| ble ZCOPYT_L2_END | |||
| mr BO, B1 | |||
| COPY_2x1 | |||
| addi B1, B1, 4*SIZE | |||
| ZCOPYT_L2_END: | |||
| ZCOPYT_L1_BEGIN: /* Handles a final single source line, taken only when bit 0 of M is set; otherwise branches straight to L999. */ | |||
| andi. T1, M, 1 | |||
| ble L999 | |||
| mr A0, A /* A0 = the last source pointer */ | |||
| add A, A0, LDA | |||
| mr B8, B /* B8 = where this line's 8-wide copies start */ | |||
| addi B, B, 16*SIZE | |||
| sradi. J, N, 3 /* J = N >> 3: number of 8-wide copies */ | |||
| ble ZCOPYT_L1x4_BEGIN | |||
| mr BO, B8 | |||
| ZCOPYT_L1x8_LOOP: | |||
| COPY_1x8 | |||
| add BO, BO, M8 /* step the destination by M8 for the next 8-wide copy */ | |||
| addic. J, J, -1 | |||
| bgt ZCOPYT_L1x8_LOOP | |||
| ZCOPYT_L1x4_BEGIN: | |||
| andi. T1, N, 4 /* 4-wide remainder only when bit 2 of N is set */ | |||
| ble ZCOPYT_L1x2_BEGIN | |||
| mr BO, B4 | |||
| COPY_1x4 | |||
| addi B4, B4, 8*SIZE | |||
| ZCOPYT_L1x2_BEGIN: | |||
| andi. T1, N, 2 /* 2-wide remainder only when bit 1 of N is set */ | |||
| ble ZCOPYT_L1x1_BEGIN | |||
| mr BO, B2 | |||
| COPY_1x2 | |||
| addi B2, B2, 4*SIZE | |||
| ZCOPYT_L1x1_BEGIN: | |||
| andi. T1, N, 1 /* 1-wide remainder only when bit 0 of N is set */ | |||
| ble ZCOPYT_L1_END | |||
| mr BO, B1 | |||
| COPY_1x1 | |||
| addi B1, B1, 2*SIZE | |||
| ZCOPYT_L1_END: | |||
| @@ -0,0 +1,535 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x8 /* Loads eight 16-byte vectors from each of A0, A1, A2, A3 (each pointer advances by 128) and stores the 32 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs63. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A0 | |||
| lxvd2x vs37, o16, A0 | |||
| lxvd2x vs38, o32, A0 | |||
| lxvd2x vs39, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs40, o0, A1 | |||
| lxvd2x vs41, o16, A1 | |||
| lxvd2x vs42, o32, A1 | |||
| lxvd2x vs43, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs44, o0, A1 | |||
| lxvd2x vs45, o16, A1 | |||
| lxvd2x vs46, o32, A1 | |||
| lxvd2x vs47, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs48, o0, A2 | |||
| lxvd2x vs49, o16, A2 | |||
| lxvd2x vs50, o32, A2 | |||
| lxvd2x vs51, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs52, o0, A2 | |||
| lxvd2x vs53, o16, A2 | |||
| lxvd2x vs54, o32, A2 | |||
| lxvd2x vs55, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs56, o0, A3 | |||
| lxvd2x vs57, o16, A3 | |||
| lxvd2x vs58, o32, A3 | |||
| lxvd2x vs59, o48, A3 | |||
| addi A3, A3, 64 | |||
| lxvd2x vs60, o0, A3 | |||
| lxvd2x vs61, o16, A3 | |||
| lxvd2x vs62, o32, A3 | |||
| lxvd2x vs63, o48, A3 | |||
| addi A3, A3, 64 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 /* stores follow load order: all of A0's data, then A1's, A2's, A3's */ | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs40, o0, T1 | |||
| stxvd2x vs41, o16, T1 | |||
| stxvd2x vs42, o32, T1 | |||
| stxvd2x vs43, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs44, o0, T1 | |||
| stxvd2x vs45, o16, T1 | |||
| stxvd2x vs46, o32, T1 | |||
| stxvd2x vs47, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs48, o0, T1 | |||
| stxvd2x vs49, o16, T1 | |||
| stxvd2x vs50, o32, T1 | |||
| stxvd2x vs51, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs52, o0, T1 | |||
| stxvd2x vs53, o16, T1 | |||
| stxvd2x vs54, o32, T1 | |||
| stxvd2x vs55, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs56, o0, T1 | |||
| stxvd2x vs57, o16, T1 | |||
| stxvd2x vs58, o32, T1 | |||
| stxvd2x vs59, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs60, o0, T1 | |||
| stxvd2x vs61, o16, T1 | |||
| stxvd2x vs62, o32, T1 | |||
| stxvd2x vs63, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x4 /* Loads four 16-byte vectors from each of A0, A1, A2, A3 (each pointer advances by 64) and stores the 16 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs47. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A1 | |||
| lxvd2x vs37, o16, A1 | |||
| lxvd2x vs38, o32, A1 | |||
| lxvd2x vs39, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs40, o0, A2 | |||
| lxvd2x vs41, o16, A2 | |||
| lxvd2x vs42, o32, A2 | |||
| lxvd2x vs43, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs44, o0, A3 | |||
| lxvd2x vs45, o16, A3 | |||
| lxvd2x vs46, o32, A3 | |||
| lxvd2x vs47, o48, A3 | |||
| addi A3, A3, 64 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs40, o0, T1 | |||
| stxvd2x vs41, o16, T1 | |||
| stxvd2x vs42, o32, T1 | |||
| stxvd2x vs43, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs44, o0, T1 | |||
| stxvd2x vs45, o16, T1 | |||
| stxvd2x vs46, o32, T1 | |||
| stxvd2x vs47, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x2 /* Loads two 16-byte vectors from each of A0, A1, A2, A3 (each pointer advances by 32) and stores the 8 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs39. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| lxvd2x vs34, o0, A1 | |||
| lxvd2x vs35, o16, A1 | |||
| addi A1, A1, 32 | |||
| lxvd2x vs36, o0, A2 | |||
| lxvd2x vs37, o16, A2 | |||
| addi A2, A2, 32 | |||
| lxvd2x vs38, o0, A3 | |||
| lxvd2x vs39, o16, A3 | |||
| addi A3, A3, 32 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_4x1 /* Loads one 16-byte vector from each of A0, A1, A2, A3 (each pointer advances by 16) and stores the 4 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs35. */ | |||
| lxvd2x vs32, o0, A0 | |||
| addi A0, A0, 16 | |||
| lxvd2x vs33, o0, A1 | |||
| addi A1, A1, 16 | |||
| lxvd2x vs34, o0, A2 | |||
| addi A2, A2, 16 | |||
| lxvd2x vs35, o0, A3 | |||
| addi A3, A3, 16 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x8 /* Loads eight 16-byte vectors from each of A0 and A1 (each pointer advances by 128) and stores the 16 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs47. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A0 | |||
| lxvd2x vs37, o16, A0 | |||
| lxvd2x vs38, o32, A0 | |||
| lxvd2x vs39, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs40, o0, A1 | |||
| lxvd2x vs41, o16, A1 | |||
| lxvd2x vs42, o32, A1 | |||
| lxvd2x vs43, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs44, o0, A1 | |||
| lxvd2x vs45, o16, A1 | |||
| lxvd2x vs46, o32, A1 | |||
| lxvd2x vs47, o48, A1 | |||
| addi A1, A1, 64 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs40, o0, T1 | |||
| stxvd2x vs41, o16, T1 | |||
| stxvd2x vs42, o32, T1 | |||
| stxvd2x vs43, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs44, o0, T1 | |||
| stxvd2x vs45, o16, T1 | |||
| stxvd2x vs46, o32, T1 | |||
| stxvd2x vs47, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x4 /* Loads four 16-byte vectors from each of A0 and A1 (each pointer advances by 64) and stores the 8 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs39. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A1 | |||
| lxvd2x vs37, o16, A1 | |||
| lxvd2x vs38, o32, A1 | |||
| lxvd2x vs39, o48, A1 | |||
| addi A1, A1, 64 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x2 /* Loads two 16-byte vectors from each of A0 and A1 (each pointer advances by 32) and stores the 4 vectors back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs35. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| lxvd2x vs34, o0, A1 | |||
| lxvd2x vs35, o16, A1 | |||
| addi A1, A1, 32 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=2 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_2x1 /* Loads one 16-byte vector from each of A0 and A1 (each pointer advances by 16) and stores the 2 vectors back to back starting at BO. BO is left unchanged; clobbers T1, vs32 and vs33. */ | |||
| lxvd2x vs32, o0, A0 | |||
| addi A0, A0, 16 | |||
| lxvd2x vs33, o0, A1 | |||
| addi A1, A1, 16 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=8 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x8 /* Loads eight 16-byte vectors from A0 (which advances by 128) and stores them back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs39. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A0 | |||
| lxvd2x vs37, o16, A0 | |||
| lxvd2x vs38, o32, A0 | |||
| lxvd2x vs39, o48, A0 | |||
| addi A0, A0, 64 | |||
| mr T1, BO /* T1 is the running store pointer so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| addi T1, T1, 64 | |||
| stxvd2x vs36, o0, T1 | |||
| stxvd2x vs37, o16, T1 | |||
| stxvd2x vs38, o32, T1 | |||
| stxvd2x vs39, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=4 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x4 /* Loads four 16-byte vectors from A0 (which advances by 64) and stores them back to back starting at BO. BO is left unchanged; clobbers T1 and vs32-vs35. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| lxvd2x vs34, o32, A0 | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| stxvd2x vs34, o32, T1 | |||
| stxvd2x vs35, o48, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=2 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x2 /* Loads two 16-byte vectors from A0 (which advances by 32) and stores them back to back starting at BO. BO is left unchanged; clobbers T1, vs32 and vs33. */ | |||
| lxvd2x vs32, o0, A0 | |||
| lxvd2x vs33, o16, A0 | |||
| addi A0, A0, 32 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| stxvd2x vs33, o16, T1 | |||
| .endm | |||
| /********************************************************************************************** | |||
| * Macros for N=1 and M=1 | |||
| **********************************************************************************************/ | |||
| .macro COPY_1x1 /* Loads one 16-byte vector from A0 (which advances by 16) and stores it at BO. BO is left unchanged; clobbers T1 and vs32. */ | |||
| lxvd2x vs32, o0, A0 | |||
| addi A0, A0, 16 | |||
| mr T1, BO /* T1 is the store base so that BO itself stays untouched */ | |||
| stxvd2x vs32, o0, T1 | |||
| .endm | |||
| @@ -933,6 +933,23 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef EXCAVATOR | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Excavator\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef PILEDRIVER | |||
| #ifdef DEBUG | |||
| @@ -1,3 +1,7 @@ | |||
| DSCALKERNEL = dscal.c | |||
| CSCALKERNEL = cscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| @@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| @@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c | |||
| STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c | |||
| STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c | |||
| STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S | |||
| DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c | |||
| CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c | |||
| CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c | |||
| CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "caxpy_microk_steamroller-2.c" | |||
| #elif defined(BULLDOZER) | |||
| #include "caxpy_microk_bulldozer-2.c" | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) | |||
| #include "cdot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) | |||
| #include "cdot_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "cdot_microk_haswell-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HASWELL) | |||
| #include "cgemv_n_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "cgemv_n_microk_bulldozer-4.c" | |||
| #endif | |||
| @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(HASWELL) | |||
| #include "cgemv_t_microk_haswell-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "cgemv_t_microk_bulldozer-4.c" | |||
| #endif | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "cscal_microk_haswell-2.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "cscal_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) | |||
| #elif defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "cscal_microk_steamroller-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "cscal_microk_bulldozer-2.c" | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "daxpy_microk_nehalem-2.c" | |||
| #elif defined(BULLDOZER) | |||
| #include "daxpy_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) | |||
| #elif defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "daxpy_microk_steamroller-2.c" | |||
| #elif defined(PILEDRIVER) | |||
| #include "daxpy_microk_piledriver-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) | |||
| #include "ddot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) | |||
| #elif defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "ddot_microk_steamroller-2.c" | |||
| #elif defined(PILEDRIVER) | |||
| #include "ddot_microk_piledriver-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) || defined(STEAMROLLER) | |||
| #elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) || defined(STEAMROLLER) | |||
| #if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dscal_microk_bulldozer-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "dscal_microk_sandy-2.c" | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dsymv_L_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "dsymv_L_microk_haswell-2.c" | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "dsymv_U_microk_bulldozer-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "dsymv_U_microk_haswell-2.c" | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "saxpy_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "saxpy_microk_sandy-2.c" | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "saxpy_microk_piledriver-2.c" | |||
| #endif | |||
| @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) | |||
| #include "sdot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) | |||
| #include "sdot_microk_steamroller-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sdot_microk_nehalem-2.c" | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "sgemv_n_microk_bulldozer-4.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sgemv_n_microk_nehalem-4.c" | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #if defined(STEAMROLLER) | |||
| #if defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #define NBMAX 2048 | |||
| #else | |||
| #define NBMAX 4096 | |||
| @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "sgemv_t_microk_nehalem-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "sgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_t_microk_sandy-4.c" | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| #if defined(STEAMROLLER) | |||
| #if defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #define NBMAX 2048 | |||
| #else | |||
| #define NBMAX 4096 | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "ssymv_L_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_L_microk_nehalem-2.c" | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "ssymv_U_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ssymv_U_microk_nehalem-2.c" | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) | |||
| #include "zaxpy_microk_bulldozer-2.c" | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zaxpy_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "zaxpy_microk_haswell-2.c" | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) | |||
| #include "zdot_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) | |||
| #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) | |||
| #include "zdot_microk_steamroller-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "zdot_microk_haswell-2.c" | |||
| @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zgemv_n_microk_haswell-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zgemv_n_microk_sandy-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zgemv_n_microk_bulldozer-4.c" | |||
| #endif | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "zgemv_t_microk_haswell-4.c" | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zscal_microk_haswell-2.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "zscal_microk_bulldozer-2.c" | |||
| #elif defined(STEAMROLLER) | |||
| #elif defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| #include "zscal_microk_steamroller-2.c" | |||
| #endif | |||
| @@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 960 | |||
| #define DGEMM_DEFAULT_P 480 | |||
| #define CGEMM_DEFAULT_P 720 | |||
| #define ZGEMM_DEFAULT_P 480 | |||
| #define SGEMM_DEFAULT_Q 720 | |||
| #define DGEMM_DEFAULT_Q 720 | |||
| #define CGEMM_DEFAULT_Q 720 | |||
| #define ZGEMM_DEFAULT_Q 720 | |||
| #define SGEMM_DEFAULT_P 1280 | |||
| #define DGEMM_DEFAULT_P 640 | |||
| #define CGEMM_DEFAULT_P 640 | |||
| #define ZGEMM_DEFAULT_P 320 | |||
| #define SGEMM_DEFAULT_Q 640 | |||
| #define DGEMM_DEFAULT_Q 640 | |||
| #define CGEMM_DEFAULT_Q 640 | |||
| #define ZGEMM_DEFAULT_Q 640 | |||
| #define SYMV_P 8 | |||
| @@ -4,6 +4,7 @@ include ../Makefile.system | |||
| all :: level1 level2 level3 | |||
| level1 : sblat1 dblat1 cblat1 zblat1 | |||
| ifndef CROSS | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 | |||
| @@ -21,8 +22,10 @@ else | |||
| OPENBLAS_NUM_THREADS=2 ./zblat1 | |||
| endif | |||
| endif | |||
| endif | |||
| level2 : sblat2 dblat2 cblat2 zblat2 | |||
| ifndef CROSS | |||
| rm -f ?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat | |||
| @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 | |||
| @@ -54,8 +57,10 @@ else | |||
| @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 | |||
| endif | |||
| endif | |||
| endif | |||
| level3 : sblat3 dblat3 cblat3 zblat3 | |||
| ifndef CROSS | |||
| rm -f ?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat | |||
| @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 | |||
| @@ -87,9 +92,11 @@ else | |||
| @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 | |||
| endif | |||
| endif | |||
| endif | |||
| level3_3m : zblat3_3m cblat3_3m | |||
| ifndef CROSS | |||
| rm -f ?BLAT3_3M.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat | |||
| @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 | |||
| @@ -109,6 +116,7 @@ else | |||
| @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS) | |||
| $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) | |||
| run_test: $(UTESTBIN) | |||
| ifndef CROSS | |||
| ./$(UTESTBIN) | |||
| endif | |||
| clean: | |||
| -rm -f *.o $(UTESTBIN) | |||