Browse Source

conflict resolved by syncing with 'xianyi:develop'

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
tags/v0.2.19^2
Shivraj Patil 10 years ago
parent
commit
085cf236c2
61 changed files with 8446 additions and 800 deletions
  1. +4
    -0
      CONTRIBUTORS.md
  2. +0
    -4
      Makefile
  3. +49
    -49
      Makefile.install
  4. +1
    -0
      README.md
  5. +9
    -9
      c_check
  6. +6
    -0
      ctest/Makefile
  7. +1
    -1
      driver/others/dynamic.c
  8. +83
    -26
      driver/others/init.c
  9. +2
    -2
      driver/others/parameter.c
  10. +2
    -2
      exports/Makefile
  11. +9
    -9
      kernel/power/KERNEL.POWER8
  12. +206
    -0
      kernel/power/cgemm_tcopy_8_power8.S
  13. +247
    -0
      kernel/power/cgemm_tcopy_logic_8_power8.S
  14. +385
    -0
      kernel/power/cgemm_tcopy_macros_8_power8.S
  15. +22
    -7
      kernel/power/dgemm_kernel_16x4_power8.S
  16. +411
    -394
      kernel/power/dgemm_logic_16x4_power8.S
  17. +297
    -234
      kernel/power/dgemm_macros_16x4_power8.S
  18. +228
    -0
      kernel/power/dgemm_ncopy_4_power8.S
  19. +237
    -0
      kernel/power/dgemm_ncopy_logic_4_power8.S
  20. +691
    -0
      kernel/power/dgemm_ncopy_macros_4_power8.S
  21. +1
    -1
      kernel/power/dgemm_tcopy_16_power8.S
  22. +4
    -0
      kernel/power/dgemm_tcopy_logic_16_power8.S
  23. +1
    -1
      kernel/power/dtrmm_kernel_16x4_power8.S
  24. +3431
    -0
      kernel/power/dtrmm_macros_16x4_power8.S
  25. +207
    -0
      kernel/power/sgemm_tcopy_8_power8.S
  26. +299
    -0
      kernel/power/sgemm_tcopy_logic_8_power8.S
  27. +308
    -0
      kernel/power/sgemm_tcopy_macros_8_power8.S
  28. +71
    -1
      kernel/power/zgemm_kernel_8x2_power8.S
  29. +60
    -9
      kernel/power/zgemm_logic_8x2_power8.S
  30. +108
    -0
      kernel/power/zgemm_macros_8x2_power8.S
  31. +205
    -0
      kernel/power/zgemm_tcopy_8_power8.S
  32. +246
    -0
      kernel/power/zgemm_tcopy_logic_8_power8.S
  33. +535
    -0
      kernel/power/zgemm_tcopy_macros_8_power8.S
  34. +17
    -0
      kernel/setparam-ref.c
  35. +19
    -17
      kernel/x86_64/KERNEL.EXCAVATOR
  36. +1
    -1
      kernel/x86_64/caxpy.c
  37. +1
    -1
      kernel/x86_64/cdot.c
  38. +1
    -1
      kernel/x86_64/cgemv_n_4.c
  39. +1
    -1
      kernel/x86_64/cgemv_t_4.c
  40. +1
    -1
      kernel/x86_64/cscal.c
  41. +1
    -1
      kernel/x86_64/daxpy.c
  42. +1
    -1
      kernel/x86_64/ddot.c
  43. +1
    -1
      kernel/x86_64/dgemv_n_4.c
  44. +1
    -1
      kernel/x86_64/dgemv_t_4.c
  45. +1
    -1
      kernel/x86_64/dscal.c
  46. +1
    -1
      kernel/x86_64/dsymv_L.c
  47. +1
    -1
      kernel/x86_64/dsymv_U.c
  48. +1
    -1
      kernel/x86_64/saxpy.c
  49. +1
    -1
      kernel/x86_64/sdot.c
  50. +2
    -2
      kernel/x86_64/sgemv_n_4.c
  51. +2
    -2
      kernel/x86_64/sgemv_t_4.c
  52. +1
    -1
      kernel/x86_64/ssymv_L.c
  53. +1
    -1
      kernel/x86_64/ssymv_U.c
  54. +1
    -1
      kernel/x86_64/zaxpy.c
  55. +1
    -1
      kernel/x86_64/zdot.c
  56. +1
    -1
      kernel/x86_64/zgemv_n_4.c
  57. +1
    -1
      kernel/x86_64/zgemv_t_4.c
  58. +1
    -1
      kernel/x86_64/zscal.c
  59. +9
    -9
      param.h
  60. +8
    -0
      test/Makefile
  61. +2
    -0
      utest/Makefile

+ 4
- 0
CONTRIBUTORS.md View File

@@ -151,5 +151,9 @@ In chronological order:
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking

* Paul Mustière <https://github.com/buffer51/>
* [2016-02-04] Fix Android build on ARMV7
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8

* Shivraj Patil <https://github.com/sva-img/>
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA

+ 0
- 4
Makefile View File

@@ -108,8 +108,6 @@ endif

tests :
ifndef NOFORTRAN
ifndef TARGET
ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@@ -119,8 +117,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
endif
endif
endif

libs :
ifeq ($(CORE), UNKOWN)


+ 49
- 49
Makefile.install View File

@@ -20,75 +20,75 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly)

install : lib.grd
@-mkdir -p $(DESTDIR)$(PREFIX)
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@-mkdir -p "$(DESTDIR)$(PREFIX)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"

@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"

ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif

ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif

#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@@ -96,34 +96,34 @@ endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else
#only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!


+ 1
- 0
README.md View File

@@ -82,6 +82,7 @@ Please read GotoBLAS_01Readme.txt
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.

## Usages
Link with libopenblas.a or -lopenblas for shared library.


+ 9
- 9
c_check View File

@@ -1,5 +1,7 @@
#!/usr/bin/perl

use File::Basename;

# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@@ -26,14 +28,12 @@ if ($?) {

$cross_suffix = "";

if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
if ($1 =~ /(.*-)(.*)/) {
$cross_suffix = $1;
}
} else {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
$cross_suffix = $1;
}
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}

if (basename($compiler_name) =~ /(.*-)(.*)/) {
$cross_suffix .= $1;
}

$compiler = "";
@@ -243,7 +243,7 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";



+ 6
- 0
ctest/Makefile View File

@@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3

all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1
@@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif
endif

all2: xscblat2 xdcblat2 xccblat2 xzcblat2
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2
@@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif
endif

all3: xscblat3 xdcblat3 xccblat3 xzcblat3
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3
@@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif





+ 1
- 1
driver/others/dynamic.c View File

@@ -439,7 +439,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];

for ( i=1 ; i <= 21; i++)
for ( i=1 ; i <= 22; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{


+ 83
- 26
driver/others/init.c View File

@@ -361,6 +361,9 @@ static void numa_mapping(void) {
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
int current_cpu;
int current_node = 0;
int cpu_count = 0;

for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
@@ -382,33 +385,84 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif

h = 1;

while (h < count) h = 2 * h + 1;

while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
current_cpu = sched_getcpu();
for (cpu = 0; cpu < count; cpu++) {
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
current_node = READ_NODE(common -> cpu_info[cpu]);
break;
}
}
for (i = 0; i < MAX_BITMASK_LEN; i++)
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);

/*
* If all the processes can be accommodated in the
* current node itself, then bind to cores
* from the current node only
*/
if (numprocs <= cpu_count) {
/*
* First sort all the cores in order from the current node.
* Then take remaining nodes one by one in order,
* and sort their cores in order.
*/
for (i = 0; i < count; i++) {
for (j = 0; j < count - 1; j++) {
int node_1, node_2;
int core_1, core_2;
int swap = 0;

node_1 = READ_NODE(common -> cpu_info[j]);
node_2 = READ_NODE(common -> cpu_info[j + 1]);
core_1 = READ_CORE(common -> cpu_info[j]);
core_2 = READ_CORE(common -> cpu_info[j + 1]);

if (node_1 == node_2) {
if (core_1 > core_2)
swap = 1;
} else {
if ((node_2 == current_node) ||
((node_1 != current_node) && (node_1 > node_2)))
swap = 1;
}
if (swap) {
unsigned long temp;

temp = common->cpu_info[j];
common->cpu_info[j] = common->cpu_info[j + 1];
common->cpu_info[j + 1] = temp;
}
}
}
} else {
h = 1;

while (h < count) h = 2 * h + 1;

while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}

}
}
}

@@ -416,7 +470,10 @@ static void numa_mapping(void) {
fprintf(stderr, "\nSorting ...\n\n");

for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
READ_CPU(common -> cpu_info[cpu]),
READ_CORE(common -> cpu_info[cpu]),
READ_NODE(common -> cpu_info[cpu]));
#endif

}


+ 2
- 2
driver/others/parameter.c View File

@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)

cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){

int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
int size = 16;
#else
int size = get_L2_size();


+ 2
- 2
exports/Makefile View File

@@ -110,9 +110,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif

dllinit.$(SUFFIX) : dllinit.c


+ 9
- 9
kernel/power/KERNEL.POWER8 View File

@@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
@@ -21,16 +21,16 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPY = dgemm_ncopy_4_power8.S
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
@@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o


+ 206
- 0
kernel/power/cgemm_tcopy_8_power8.S View File

@@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7

#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11

#define J r12

#define PREA r14
#define PREB r15
#define BO r16
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20
#define o4 r21
#define T2 r22
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTUS2 r27
#define M8 r30
#define T1 r31

#define o0 0

#include "cgemm_tcopy_macros_8_power8.S"

#define STACKSIZE 384


/*
 * Entry point of the routine. Arguments arrive in the registers aliased
 * above as M (r3), N (r4), A (r5), LDA (r6) and B (r7). The actual copy
 * loops live in the included cgemm_tcopy_logic_8_power8.S; this wrapper
 * only sets up the stack frame, a few derived pointers/constants, and
 * tears the frame down again.
 * NOTE(review): PROLOGUE / PROFCODE / EPILOGUE are macros not defined in
 * this file -- presumably provided via common.h; confirm.
 */
PROLOGUE
PROFCODE

/* Reserve a STACKSIZE-byte frame and clear r0. */
addi SP, SP, -STACKSIZE
li r0, 0

/* Spill r14..r31 into the frame (offsets 144..280) so they can be used
 * as scratch; they are reloaded from the same slots at L999. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)

/* Nothing to do when M <= 0 or N <= 0: jump straight to the exit path. */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999

/* Scale LDA by 2^ZBASE_SHIFT and set M8 = M << (3 + ZBASE_SHIFT).
 * NOTE(review): presumably converts element counts to byte strides
 * (M8 = bytes in 8 * M elements) -- confirm ZBASE_SHIFT in common.h. */
slwi LDA, LDA, ZBASE_SHIFT
slwi M8, M, 3 + ZBASE_SHIFT

/* T2, PREA and PREB temporarily hold the masks -8, -4 and -2
 * (PREA/PREB are overwritten with their final values further down). */
li T2, -8
li PREA, -4
li PREB, -2

/* B4 = (N & -8) * M, B2 = (N & -4) * M, B1 = (N & -2) * M,
 * i.e. N rounded down to a multiple of 8 / 4 / 2, times M. */
and B4, N, T2
and B2, N, PREA
and B1, N, PREB
mullw B4, B4, M
mullw B2, B2, M
mullw B1, B1, M

/* Scale the three counts by 2^ZBASE_SHIFT ... */
slwi B4, B4, ZBASE_SHIFT
slwi B2, B2, ZBASE_SHIFT
slwi B1, B1, ZBASE_SHIFT

/* ... and add the base B, turning them into pointers into B.
 * NOTE(review): presumably where the 4-, 2- and 1-wide remainder
 * sections of the packed output start -- verify against the logic file. */
add B4, B4, B
add B2, B2, B
add B1, B1, B

/* Final values of PREA / PREB.
 * NOTE(review): names suggest prefetch distances for A and B -- confirm
 * in the included macros/logic files. */
li PREA, 384
addi PREB, M8, 128

/* Constant byte offsets used as index registers by the included code. */
li o4, 4
li o16, 16
li o32, 32
li o48, 48

/* The copy loops themselves. */
#include "cgemm_tcopy_logic_8_power8.S"

/* Common exit path (also the target of the early-outs above). */
L999:

/* r3 = 0; presumably the routine's return value -- r3 is also the
 * register aliased as M on entry. */
li r3, 0

/* Reload r14..r31 from the slots they were spilled to on entry. */
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)

/* Release the stack frame and return to the caller. */
addi SP, SP, STACKSIZE

blr
EPILOGUE



+ 247
- 0
kernel/power/cgemm_tcopy_logic_8_power8.S View File

@@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/* I = M >> 2 (32-bit arithmetic shift): number of 4-row groups.         */
/* When there is no complete group, continue with the 2-row remainder.   */
srawi. I, M, 2
ble CCOPYT_L2_BEGIN


/* ---- one iteration per group of four LDA-strided rows ---- */
CCOPYT_L4_BEGIN:

/* A0..A3 address the four rows of this group; A moves past them. */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
/* B8 keeps the start of this group in the 8-wide area; B steps on by    */
/* 64*SIZE for the next group.                                           */
mr B8, B
addi B, B, 64*SIZE

/* J = N >> 3: number of full 8-element chunks in each row. */
sradi. J, N, 3
ble CCOPYT_L4x4_BEGIN

mr BO, B8

/* The loop body is unrolled twice: the first copy issues the prefetch   */
/* hints, the second does not. The loop can leave after either copy.     */
CCOPYT_L4x8_LOOP:

dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
dcbtst BO, M8
dcbtst BO, PREB
COPY_4x8

/* Consecutive 8-wide chunks of the same group are M8 bytes apart. */
add BO, BO, M8

addic. J, J, -1
ble CCOPYT_L4x4_BEGIN


COPY_4x8

add BO, BO, M8

addic. J, J, -1
bgt CCOPYT_L4x8_LOOP

/* Tail: bit 2 of N set means four more elements per row go to B4.       */
/* andi. never yields a negative value, so ble branches only on zero.    */
CCOPYT_L4x4_BEGIN:

andi. T1, N, 4
ble CCOPYT_L4x2_BEGIN

mr BO, B4

COPY_4x4


addi B4, B4, 32*SIZE

/* Tail: bit 1 of N set means two more elements per row go to B2. */
CCOPYT_L4x2_BEGIN:

andi. T1, N, 2
ble CCOPYT_L4x1_BEGIN

mr BO, B2

COPY_4x2


addi B2, B2, 16*SIZE

/* Tail: bit 0 of N set means one last element per row goes to B1. */
CCOPYT_L4x1_BEGIN:

andi. T1, N, 1
ble CCOPYT_L4_END

mr BO, B1

COPY_4x1


addi B1, B1, 8*SIZE

CCOPYT_L4_END:

/* Next group of four rows, if any. */
addic. I, I, -1
bgt CCOPYT_L4_BEGIN



/* ---- remainder: two rows, taken when bit 1 of M is set ---- */
/* andi. never yields a negative value, so each ble below branches only  */
/* when the tested bit is clear.                                         */
CCOPYT_L2_BEGIN:

andi. T1, M, 2
ble CCOPYT_L1_BEGIN

/* A0/A1 address the two rows; A moves past them. */
mr A0, A
add A1, A0, LDA
add A, A1, LDA
/* B8 keeps the start of this pair in the 8-wide area; B steps on by     */
/* 32*SIZE.                                                              */
mr B8, B
addi B, B, 32*SIZE

/* J = N >> 3: number of full 8-element chunks in each row. */
sradi. J, N, 3
ble CCOPYT_L2x4_BEGIN

mr BO, B8

CCOPYT_L2x8_LOOP:

COPY_2x8

/* Consecutive 8-wide chunks are M8 bytes apart in the destination. */
add BO, BO, M8

addic. J, J, -1
bgt CCOPYT_L2x8_LOOP

/* Tail of four elements per row, written at B4. */
CCOPYT_L2x4_BEGIN:

andi. T1, N, 4
ble CCOPYT_L2x2_BEGIN

mr BO, B4

COPY_2x4


addi B4, B4, 16*SIZE

/* Tail of two elements per row, written at B2. */
CCOPYT_L2x2_BEGIN:

andi. T1, N, 2
ble CCOPYT_L2x1_BEGIN

mr BO, B2

COPY_2x2


addi B2, B2, 8*SIZE

/* Tail of one element per row, written at B1. */
CCOPYT_L2x1_BEGIN:

andi. T1, N, 1
ble CCOPYT_L2_END

mr BO, B1

COPY_2x1


addi B1, B1, 4*SIZE

CCOPYT_L2_END:


/* ---- remainder: a single row, taken when bit 0 of M is set ---- */
/* L999 is not defined in this file; it is the exit label of the file    */
/* that includes this logic.                                             */
CCOPYT_L1_BEGIN:

andi. T1, M, 1
ble L999

/* A0 addresses the row; A moves past it. */
mr A0, A
add A, A0, LDA
/* B8 keeps the start of this row in the 8-wide area; B steps on by      */
/* 16*SIZE.                                                              */
mr B8, B
addi B, B, 16*SIZE

/* J = N >> 3: number of full 8-element chunks in the row. */
sradi. J, N, 3
ble CCOPYT_L1x4_BEGIN

mr BO, B8

CCOPYT_L1x8_LOOP:

COPY_1x8

/* Consecutive 8-wide chunks are M8 bytes apart in the destination. */
add BO, BO, M8

addic. J, J, -1
bgt CCOPYT_L1x8_LOOP

/* Tail of four elements, written at B4. */
CCOPYT_L1x4_BEGIN:

andi. T1, N, 4
ble CCOPYT_L1x2_BEGIN

mr BO, B4

COPY_1x4


addi B4, B4, 8*SIZE

/* Tail of two elements, written at B2. */
CCOPYT_L1x2_BEGIN:

andi. T1, N, 2
ble CCOPYT_L1x1_BEGIN

mr BO, B2

COPY_1x2


addi B2, B2, 4*SIZE

/* Tail of one element, written at B1. */
CCOPYT_L1x1_BEGIN:

andi. T1, N, 1
ble CCOPYT_L1_END

mr BO, B1

COPY_1x1


addi B1, B1, 2*SIZE

CCOPYT_L1_END:


+ 385
- 0
kernel/power/cgemm_tcopy_macros_8_power8.S View File

@@ -0,0 +1,385 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/**********************************************************************************************
* COPY_4x8: four rows (A0..A3), 64 bytes (eight 8-byte elements) from each row.
* The logic file uses it under the M>>2 row-group loop and the N>>3 chunk loop.
*
* Reads : 64 bytes at each of A0, A1, A2, A3
* Writes: 256 contiguous bytes at BO, row A0 first, then A1, A2, A3
* After : A0..A3 advanced by 64; BO unchanged; T1 = BO + 192; vs32..vs47 overwritten
**********************************************************************************************/

.macro COPY_4x8

/* Rows A0 and A1 -> vs32..vs39 */
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0

lxvw4x vs36, o0, A1
lxvw4x vs37, o16, A1
lxvw4x vs38, o32, A1
lxvw4x vs39, o48, A1

addi A0, A0, 64
addi A1, A1, 64

/* Rows A2 and A3 -> vs40..vs47 */
lxvw4x vs40, o0, A2
lxvw4x vs41, o16, A2
lxvw4x vs42, o32, A2
lxvw4x vs43, o48, A2

lxvw4x vs44, o0, A3
lxvw4x vs45, o16, A3
lxvw4x vs46, o32, A3
lxvw4x vs47, o48, A3

/* T1 is the store cursor so that BO itself stays untouched. */
mr T1, BO
addi A2, A2, 64
addi A3, A3, 64

/* Four 64-byte groups are stored back to back. */
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1

addi T1, T1, 64

stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1

addi T1, T1, 64

stxvw4x vs40, o0, T1
stxvw4x vs41, o16, T1
stxvw4x vs42, o32, T1
stxvw4x vs43, o48, T1

addi T1, T1, 64

stxvw4x vs44, o0, T1
stxvw4x vs45, o16, T1
stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1

.endm

/**********************************************************************************************
* COPY_4x4: four rows (A0..A3), 32 bytes (four 8-byte elements) from each row.
*
* Reads : 32 bytes at each of A0, A1, A2, A3
* Writes: 128 contiguous bytes at BO, row A0 first, then A1, A2, A3
* After : A0..A3 advanced by 32; BO unchanged; T1 = BO + 64; vs32..vs39 overwritten
**********************************************************************************************/

.macro COPY_4x4

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* First 16 bytes of every row. */
    lxvw4x vs32, o0, A0
    lxvw4x vs34, o0, A1
    lxvw4x vs36, o0, A2
    lxvw4x vs38, o0, A3

    /* Second 16 bytes of every row. */
    lxvw4x vs33, o16, A0
    lxvw4x vs35, o16, A1
    lxvw4x vs37, o16, A2
    lxvw4x vs39, o16, A3

    /* Step each row pointer past the 32 bytes just read. */
    addi A0, A0, 32
    addi A1, A1, 32
    addi A2, A2, 32
    addi A3, A3, 32

    /* Rows A0 and A1 -> BO + 0 .. BO + 63 */
    stxvw4x vs32, o0, T1
    stxvw4x vs33, o16, T1
    stxvw4x vs34, o32, T1
    stxvw4x vs35, o48, T1

    addi T1, T1, 64

    /* Rows A2 and A3 -> BO + 64 .. BO + 127 */
    stxvw4x vs36, o0, T1
    stxvw4x vs37, o16, T1
    stxvw4x vs38, o32, T1
    stxvw4x vs39, o48, T1

.endm

/**********************************************************************************************
* COPY_4x2: four rows (A0..A3), 16 bytes (two 8-byte elements) from each row.
*
* Reads : 16 bytes at each of A0, A1, A2, A3
* Writes: 64 contiguous bytes at BO, row A0 first, then A1, A2, A3
* After : A0..A3 advanced by 16; BO unchanged; T1 = BO; vs32..vs35 overwritten
**********************************************************************************************/

.macro COPY_4x2

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* One 16-byte vector per row. */
    lxvw4x vs32, o0, A0
    lxvw4x vs33, o0, A1
    lxvw4x vs34, o0, A2
    lxvw4x vs35, o0, A3

    /* Step each row pointer past the 16 bytes just read. */
    addi A0, A0, 16
    addi A1, A1, 16
    addi A2, A2, 16
    addi A3, A3, 16

    /* The four vectors are stored back to back. */
    stxvw4x vs32, o0, T1
    stxvw4x vs33, o16, T1
    stxvw4x vs34, o32, T1
    stxvw4x vs35, o48, T1

.endm

/**********************************************************************************************
* COPY_4x1: four rows (A0..A3), 8 bytes (two single-precision words) from each row.
*
* Reads : the words at offsets 0 and 4 of each of A0, A1, A2, A3
* Writes: 32 contiguous bytes at BO, row A0 first, then A1, A2, A3
* After : A0..A3 advanced by 8; BO unchanged; T1 = BO + 24; vs32..vs39 overwritten
**********************************************************************************************/

.macro COPY_4x1

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* Word at offset 0 of every row. */
    lxsspx vs32, o0, A0
    lxsspx vs34, o0, A1
    lxsspx vs36, o0, A2
    lxsspx vs38, o0, A3

    /* Word at offset 4 of every row. */
    lxsspx vs33, o4, A0
    lxsspx vs35, o4, A1
    lxsspx vs37, o4, A2
    lxsspx vs39, o4, A3

    /* Step each row pointer past the 8 bytes just read. */
    addi A0, A0, 8
    addi A1, A1, 8
    addi A2, A2, 8
    addi A3, A3, 8

    /* Row A0 */
    stxsspx vs32, o0, T1
    stxsspx vs33, o4, T1
    addi T1, T1, 8

    /* Row A1 */
    stxsspx vs34, o0, T1
    stxsspx vs35, o4, T1
    addi T1, T1, 8

    /* Row A2 */
    stxsspx vs36, o0, T1
    stxsspx vs37, o4, T1
    addi T1, T1, 8

    /* Row A3 */
    stxsspx vs38, o0, T1
    stxsspx vs39, o4, T1

.endm

/**********************************************************************************************
* COPY_2x8: two rows (A0, A1), 64 bytes (eight 8-byte elements) from each row.
*
* Reads : 64 bytes at each of A0, A1
* Writes: 128 contiguous bytes at BO, row A0 first, then A1
* After : A0, A1 advanced by 64; BO unchanged; T1 = BO + 64; vs32..vs39 overwritten
**********************************************************************************************/

.macro COPY_2x8

/* Row A0 -> vs32..vs35 */
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0
addi A0, A0, 64

/* Row A1 -> vs36..vs39 */
lxvw4x vs36, o0, A1
lxvw4x vs37, o16, A1
lxvw4x vs38, o32, A1
lxvw4x vs39, o48, A1
addi A1, A1, 64

/* T1 is the store cursor so that BO itself stays untouched. */
mr T1, BO

stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1

addi T1, T1, 64

stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1

.endm

/**********************************************************************************************
* COPY_2x4: two rows (A0, A1), 32 bytes (four 8-byte elements) from each row.
*
* Reads : 32 bytes at each of A0, A1
* Writes: 64 contiguous bytes at BO, row A0 first, then A1
* After : A0, A1 advanced by 32; BO unchanged; T1 = BO; vs32..vs35 overwritten
**********************************************************************************************/

.macro COPY_2x4

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* First 16 bytes of both rows, then the second 16 bytes. */
    lxvw4x vs32, o0, A0
    lxvw4x vs34, o0, A1
    lxvw4x vs33, o16, A0
    lxvw4x vs35, o16, A1

    /* Step both row pointers past the 32 bytes just read. */
    addi A0, A0, 32
    addi A1, A1, 32

    /* Row A0 -> BO + 0 .. 31, row A1 -> BO + 32 .. 63 */
    stxvw4x vs32, o0, T1
    stxvw4x vs33, o16, T1
    stxvw4x vs34, o32, T1
    stxvw4x vs35, o48, T1

.endm

/**********************************************************************************************
* COPY_2x2: two rows (A0, A1), 16 bytes (two 8-byte elements) from each row.
*
* Reads : 16 bytes at each of A0, A1
* Writes: 32 contiguous bytes at BO, row A0 first, then A1
* After : A0, A1 advanced by 16; BO unchanged; T1 = BO; vs32, vs33 overwritten
**********************************************************************************************/

.macro COPY_2x2

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* One 16-byte vector per row. */
    lxvw4x vs32, o0, A0
    lxvw4x vs33, o0, A1

    /* Step both row pointers past the 16 bytes just read. */
    addi A0, A0, 16
    addi A1, A1, 16

    stxvw4x vs32, o0, T1
    stxvw4x vs33, o16, T1

.endm

/**********************************************************************************************
* COPY_2x1: two rows (A0, A1), 8 bytes (two single-precision words) from each row.
*
* Reads : the words at offsets 0 and 4 of each of A0, A1
* Writes: 16 contiguous bytes at BO, row A0 first, then A1
* After : A0, A1 advanced by 8; BO unchanged; T1 = BO + 8; vs32..vs35 overwritten
**********************************************************************************************/

.macro COPY_2x1

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    /* Word at offset 0 of both rows, then the word at offset 4. */
    lxsspx vs32, o0, A0
    lxsspx vs34, o0, A1
    lxsspx vs33, o4, A0
    lxsspx vs35, o4, A1

    /* Step both row pointers past the 8 bytes just read. */
    addi A0, A0, 8
    addi A1, A1, 8

    /* Row A0 */
    stxsspx vs32, o0, T1
    stxsspx vs33, o4, T1
    addi T1, T1, 8

    /* Row A1 */
    stxsspx vs34, o0, T1
    stxsspx vs35, o4, T1

.endm

/**********************************************************************************************
* COPY_1x8: one row (A0), 64 bytes (eight 8-byte elements).
*
* Reads : 64 bytes at A0
* Writes: 64 contiguous bytes at BO
* After : A0 advanced by 64; BO unchanged; T1 = BO; vs32..vs35 overwritten
**********************************************************************************************/

.macro COPY_1x8

lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
lxvw4x vs34, o32, A0
lxvw4x vs35, o48, A0
addi A0, A0, 64

/* T1 is the store cursor so that BO itself stays untouched. */
mr T1, BO

stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1

.endm

/**********************************************************************************************
* COPY_1x4: one row (A0), 32 bytes (four 8-byte elements).
*
* Reads : 32 bytes at A0
* Writes: 32 contiguous bytes at BO
* After : A0 advanced by 32; BO unchanged; T1 = BO; vs32, vs33 overwritten
**********************************************************************************************/

.macro COPY_1x4

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    lxvw4x vs32, o0, A0
    lxvw4x vs33, o16, A0

    stxvw4x vs32, o0, T1
    stxvw4x vs33, o16, T1

    /* Step the row pointer past the 32 bytes just copied. */
    addi A0, A0, 32

.endm

/**********************************************************************************************
* COPY_1x2: one row (A0), 16 bytes (two 8-byte elements).
*
* Reads : 16 bytes at A0
* Writes: 16 bytes at BO
* After : A0 advanced by 16; BO unchanged; T1 = BO; vs32 overwritten
**********************************************************************************************/

.macro COPY_1x2

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    lxvw4x vs32, o0, A0
    stxvw4x vs32, o0, T1

    /* Step the row pointer past the 16 bytes just copied. */
    addi A0, A0, 16

.endm

/**********************************************************************************************
* COPY_1x1: one row (A0), 8 bytes (two single-precision words).
*
* Reads : the words at offsets 0 and 4 of A0
* Writes: 8 bytes at BO
* After : A0 advanced by 8; BO unchanged; T1 = BO; vs32, vs33 overwritten
**********************************************************************************************/

.macro COPY_1x1

    /* Store cursor; BO itself is never modified by this macro. */
    mr T1, BO

    lxsspx vs32, o0, A0
    lxsspx vs33, o4, A0

    stxsspx vs32, o0, T1
    stxsspx vs33, o4, T1

    /* Step the row pointer past the 8 bytes just copied. */
    addi A0, A0, 8

.endm


+ 22
- 7
kernel/power/dgemm_kernel_16x4_power8.S View File

@@ -131,13 +131,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define o0 0

#define T4 r12
#define T3 r11

#define o40 r12
#define o56 r11

#define o112 r14
#define o8 r15
#define o24 r16
#define ALPHA r17
#define o64 r17
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define o80 r20
#define o96 r21
#define I r22
#define J r23
#define AO r24
@@ -202,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@@ -220,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
#endif

stfd f1, ALPHA_SP
@@ -260,19 +269,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble .L999_H1

#ifdef __64BIT__
addi ALPHA, SP, 296
addi T1, SP, 296
#else
addi ALPHA, SP, 224
addi T1, SP, 224
#endif

li PRE, 256
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112

lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_r, 0, T1

#include "dgemm_logic_16x4_power8.S"

@@ -320,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@@ -338,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
#endif

addi SP, SP, STACKSIZE


+ 411
- 394
kernel/power/dgemm_logic_16x4_power8.S
File diff suppressed because it is too large
View File


+ 297
- 234
kernel/power/dgemm_macros_16x4_power8.S View File

@@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO

addi AO, AO, 64

lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO

lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO

addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32

.endm


.macro KERNEL4x16_I1

xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24

lxvd2x vs8, 0, AO
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO

xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24

lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO

xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25

lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO

xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25

addi AO, AO, 64

xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26

lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO

xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26
xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26

lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO

xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27

xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27

lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO

xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27
xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27

addi AO, AO, 64
addi BO, BO, 32
addi AO, AO, 128

.endm



.macro KERNEL4x16_1

xvmaddadp vs32, vs0, vs24
@@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24

lxvd2x vs8, 0, AO
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO

xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
@@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25

lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO

xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25

addi AO, AO, 64

xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26

lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO

xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26

lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO

xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
@@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27

addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32

.endm
@@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29

addi AO, AO, 64

xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30

lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO

xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30

lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO

xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
@@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31

addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32

.endm

/* KERNEL4x16_L1: first half of a two-step pipelined update.             */
/* Accumulates vs0..vs7 times vs24..vs27 into vs32..vs63 (xvmaddadp),    */
/* and in between loads the operands of the following step:              */
/*   vs8..vs15  <- 128 bytes at AO (offsets 0..112)                      */
/*   vs28..vs31 <- the doublewords at BO+0, +8, +16, +24, each splatted  */
/* AO is advanced by 128. BO is not advanced here.                       */
/* vs0..vs7 and vs24..vs27 must already be loaded on entry; this macro   */
/* does not load them.                                                   */
.macro KERNEL4x16_L1

/* vs32..vs39 += vs0..vs7 * vs24 */
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24

lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO

xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24

lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO

/* vs40..vs47 += vs0..vs7 * vs25 */
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25


xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25


/* vs48..vs55 += vs0..vs7 * vs26 */
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26

lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO

xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26

lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO

/* vs56..vs63 += vs0..vs7 * vs27 */
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27


lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO

xvmaddadp vs60, vs4, vs27
xvmaddadp vs61, vs5, vs27
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27

addi AO, AO, 128

.endm

/* KERNEL4x16_L2: second half of the two-step pipelined update.          */
/* Accumulates vs8..vs15 times vs28..vs31 into vs32..vs63 (xvmaddadp),   */
/* and in between reloads the operands of the first half:                */
/*   vs0..vs7   <- 128 bytes at AO (offsets 0..112)                      */
/*   vs24..vs27 <- doublewords at BO + o32, o40, o48, o56, each splatted */
/* AO is advanced by 128 and BO by 64; both addi are placed between the  */
/* last four multiply-adds.                                              */
/* NOTE(review): o40 and o56 are defined as r12 and r11, the registers   */
/* also named T4 and T3 in the kernel file. Their load with 40 and 56 is */
/* not visible here; confirm it happens after every use as T3/T4.        */
.macro KERNEL4x16_L2

/* vs32..vs39 += vs8..vs15 * vs28 */
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28

lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO

xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28

lxvdsx vs24, o32, BO
lxvdsx vs25, o40, BO

/* vs40..vs47 += vs8..vs15 * vs29 */
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29

lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO

xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29


/* vs48..vs55 += vs8..vs15 * vs30 */
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30

lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO

xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30

lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO

/* vs56..vs63 += vs8..vs15 * vs31 */
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31

lxvdsx vs26, o48, BO
lxvdsx vs27, o56, BO

xvmaddadp vs60, vs12, vs31
addi AO, AO, 128
xvmaddadp vs61, vs13, vs31
xvmaddadp vs62, vs14, vs31
addi BO, BO, 64
xvmaddadp vs63, vs15, vs31


.endm


.macro KERNEL4x16_E2


@@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO

addi AO, AO, 64
addi BO, BO, 32

lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO

addi AO, AO, 64


xvmaddadp vs32, vs0, vs24
@@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
addi BO, BO, 32
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
@@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
addi AO, AO, 128
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
@@ -430,21 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16

mr T1, CO
addi T2, T1, 64
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC

lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
lxvd2x vs2, o32, CO
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO

lxvd2x vs8, 0, T2
lxvd2x vs9, o16, T2
lxvd2x vs10, o32, T2
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2

lxvd2x vs24, 0, T3
lxvd2x vs25, o16, T3
lxvd2x vs26, o32, T3
lxvd2x vs27, o48, T3
lxvd2x vs28, o64, T3
lxvd2x vs29, o80, T3
lxvd2x vs30, o96, T3
lxvd2x vs31, o112, T3

#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1

lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif

#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
@@ -453,171 +599,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif

stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1

dcbt T1, PRE

stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2

add T1, T1, LDC
add T2, T2, LDC
lxvd2x vs32, 0, T4
lxvd2x vs33, o16, T4
lxvd2x vs34, o32, T4
lxvd2x vs35, o48, T4
lxvd2x vs36, o64, T4
lxvd2x vs37, o80, T4
lxvd2x vs38, o96, T4
lxvd2x vs39, o112, T4

#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1

lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif

#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif

stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1

dcbt T1, PRE

stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2

add T1, T1, LDC
add T2, T2, LDC

#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1

lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
#endif

#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
xvmaddadp vs2, vs50, alpha_r
xvmaddadp vs3, vs51, alpha_r
xvmaddadp vs4, vs52, alpha_r
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
#endif

stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1

dcbt T1, PRE

stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2

add T1, T1, LDC
add T2, T2, LDC

#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1

lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif

#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r

stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs4, o64, T1
stxvd2x vs5, o80, T1
stxvd2x vs6, o96, T1
stxvd2x vs7, o112, T1

xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r

stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2

xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r

stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2

xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r

stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3

xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r

stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3

stxvd2x vs32, o0, T4
stxvd2x vs33, o16, T4
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4

dcbt T1, PRE
addi CO, CO, 128

stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
stxvd2x vs39, o112, T4

addi CO, CO, 128

.endm



+ 228
- 0
kernel/power/dgemm_ncopy_4_power8.S View File

@@ -0,0 +1,228 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

/* Incoming values: these registers are read below without being set in  */
/* this file. M and N are tested for <= 0, and LDA is scaled by          */
/* BASE_SHIFT before use. A and B are not referenced in the part of the  */
/* file visible here; presumably they are the source and destination     */
/* pointers used by the included logic. Confirm there.                   */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7

/* NOTE(review): A0..A3, J, BO, T1, T2 and I are only named here; their  */
/* use is in the included macro and logic files, which are not visible   */
/* in this file.                                                         */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11

#define J r12

/* PREA and PREB are both loaded with 384 below. o8..o112 are index      */
/* registers loaded with the constant in their name. NOTU1 and NOTU2 are */
/* not referenced in this file.                                          */
#define PREA r14
#define PREB r15
#define BO r16
#define o64 r17
#define o80 r18
#define o96 r19
#define o112 r20
#define o8 r21
#define T2 r22
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTU1 r27
#define NOTU2 r30
#define T1 r31

/* Literal 0 in the RA slot of an indexed load/store means "no index". */
#define o0 0

#include "dgemm_ncopy_macros_4_power8.S"

/* Bytes reserved on the stack: f14-f31 are saved at 0..136(SP) and      */
/* r14-r31 at 144..280(SP).                                              */
#define STACKSIZE 384


PROLOGUE
PROFCODE

    /*
     * Entry point of the double precision ncopy packing routine.
     *
     * Reserves STACKSIZE bytes, saves f14-f31 and r14-r31, sets up the
     * constant offset registers, runs the included copy logic and returns 0
     * in r3.  Nothing is copied when M <= 0 or N <= 0.
     *
     * Save area layout (byte offsets from SP after the adjustment):
     *     0 .. 136   f14 .. f31
     *   144 .. 280   r31 .. r14
     *
     * NOTE(review): only f14-f31 and r14-r31 are preserved here.  The VMX
     * registers v20-v31 (VSX vs52-vs63) are not saved, so the included
     * macros must not modify them - confirm against the macro file.
     */
    addi    SP, SP, -STACKSIZE
    li      r0, 0

    /* Nonvolatile general purpose registers. */
    std     r14, 280(SP)
    std     r15, 272(SP)
    std     r16, 264(SP)
    std     r17, 256(SP)
    std     r18, 248(SP)
    std     r19, 240(SP)
    std     r20, 232(SP)
    std     r21, 224(SP)
    std     r22, 216(SP)
    std     r23, 208(SP)
    std     r24, 200(SP)
    std     r25, 192(SP)
    std     r26, 184(SP)
    std     r27, 176(SP)
    std     r28, 168(SP)
    std     r29, 160(SP)
    std     r30, 152(SP)
    std     r31, 144(SP)

    /* Nonvolatile floating point registers. */
    stfd    f14, 0(SP)
    stfd    f15, 8(SP)
    stfd    f16, 16(SP)
    stfd    f17, 24(SP)
    stfd    f18, 32(SP)
    stfd    f19, 40(SP)
    stfd    f20, 48(SP)
    stfd    f21, 56(SP)
    stfd    f22, 64(SP)
    stfd    f23, 72(SP)
    stfd    f24, 80(SP)
    stfd    f25, 88(SP)
    stfd    f26, 96(SP)
    stfd    f27, 104(SP)
    stfd    f28, 112(SP)
    stfd    f29, 120(SP)
    stfd    f30, 128(SP)
    stfd    f31, 136(SP)

    /* Nothing to do for an empty extent. */
    cmpwi   cr0, M, 0
    ble-    L999
    cmpwi   cr0, N, 0
    ble-    L999

    /* Convert the stride from elements to bytes. */
    slwi    LDA, LDA, BASE_SHIFT

    /* Constant index registers used by the copy macros. */
    li      o112, 112
    li      o96, 96
    li      o80, 80
    li      o64, 64
    li      o48, 48
    li      o32, 32
    li      o16, 16
    li      o8, 8

    /* Prefetch distances in bytes. */
    li      PREA, 384
    li      PREB, 384

#include "dgemm_ncopy_logic_4_power8.S"

L999:

    /* Restore the saved registers from the slots used above. */
    lfd     f14, 0(SP)
    lfd     f15, 8(SP)
    lfd     f16, 16(SP)
    lfd     f17, 24(SP)
    lfd     f18, 32(SP)
    lfd     f19, 40(SP)
    lfd     f20, 48(SP)
    lfd     f21, 56(SP)
    lfd     f22, 64(SP)
    lfd     f23, 72(SP)
    lfd     f24, 80(SP)
    lfd     f25, 88(SP)
    lfd     f26, 96(SP)
    lfd     f27, 104(SP)
    lfd     f28, 112(SP)
    lfd     f29, 120(SP)
    lfd     f30, 128(SP)
    lfd     f31, 136(SP)

    ld      r14, 280(SP)
    ld      r15, 272(SP)
    ld      r16, 264(SP)
    ld      r17, 256(SP)
    ld      r18, 248(SP)
    ld      r19, 240(SP)
    ld      r20, 232(SP)
    ld      r21, 224(SP)
    ld      r22, 216(SP)
    ld      r23, 208(SP)
    ld      r24, 200(SP)
    ld      r25, 192(SP)
    ld      r26, 184(SP)
    ld      r27, 176(SP)
    ld      r28, 168(SP)
    ld      r29, 160(SP)
    ld      r30, 152(SP)
    ld      r31, 144(SP)

    /* Return value 0, release the frame and return. */
    li      r3, 0
    addi    SP, SP, STACKSIZE

    blr
EPILOGUE



+ 237
- 0
kernel/power/dgemm_ncopy_logic_4_power8.S View File

@@ -0,0 +1,237 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/*
 * N loop, four LDA-strided vectors of A per pass.
 *
 * BO starts at B and is advanced by the COPY_4x* macros.  For every group
 * of four vectors the M extent is consumed as floor(M/16) chunks of 16
 * followed by at most one chunk each of 8, 4, 2 and 1 elements, selected
 * by the corresponding bit of M.
 */
    mr      BO, B
    srawi.  I, N, 2                 /* I = N / 4 */
    ble     DCOPYN_L2_BEGIN


DCOPYN_L4_BEGIN:


DCOPYN_L4_LOOP:

    /* A0..A3 address four consecutive vectors; A moves past them. */
    mr      A0, A
    add     A1, A0, LDA
    add     A2, A1, LDA
    add     A3, A2, LDA
    add     A, A3, LDA

DCOPYN_L4x16_BEGIN:

    srawi.  J, M, 4                 /* J = M / 16 */
    ble     DCOPYN_L4x16_END

DCOPYN_L4x16_LOOP:

    /* Prefetch hints PREA bytes ahead of each read pointer. */
    dcbt    A3, PREA
    dcbt    A2, PREA
    dcbt    A1, PREA
    dcbt    A0, PREA
    COPY_4x16
    addic.  J, J, -1
    bgt     DCOPYN_L4x16_LOOP

DCOPYN_L4x16_END:


DCOPYN_L4x8_BEGIN:

    andi.   J, M, 8                 /* remaining chunk of 8 */
    ble     DCOPYN_L4x8_END
    COPY_4x8

DCOPYN_L4x8_END:


DCOPYN_L4x4_BEGIN:

    andi.   J, M, 4                 /* remaining chunk of 4 */
    ble     DCOPYN_L4x4_END
    COPY_4x4

DCOPYN_L4x4_END:


DCOPYN_L4x2_BEGIN:

    andi.   J, M, 2                 /* remaining chunk of 2 */
    ble     DCOPYN_L4x2_END
    COPY_4x2

DCOPYN_L4x2_END:


DCOPYN_L4x1_BEGIN:

    andi.   J, M, 1                 /* last single element */
    ble     DCOPYN_L4x1_END
    COPY_4x1

DCOPYN_L4x1_END:


DCOPYN_L4_END:

    /* Next group of four vectors. */
    addic.  I, I, -1
    bgt     DCOPYN_L4_LOOP

DCOPYN_L2_BEGIN:

    /*
     * Tail of the N loop: two LDA-strided vectors of A, taken when bit 1
     * of N is set.  The section runs at most once; nothing in this block
     * branches back to DCOPYN_L2_LOOP.
     *
     * The test names N through its alias instead of the bare register
     * number, so it keeps working if the alias is ever remapped.
     */
    andi.   T1, N, 2
    ble     DCOPYN_L2_END

DCOPYN_L2_LOOP:

    /* A0/A1 address the two vectors; A moves past them. */
    mr      A0, A
    add     A1, A0, LDA
    add     A, A1, LDA

DCOPYN_L2x16_BEGIN:

    srawi.  J, M, 4                 /* J = M / 16 */
    ble     DCOPYN_L2x16_END

DCOPYN_L2x16_LOOP:

    COPY_2x16
    addic.  J, J, -1
    bgt     DCOPYN_L2x16_LOOP

DCOPYN_L2x16_END:


DCOPYN_L2x8_BEGIN:

    andi.   J, M, 8                 /* remaining chunk of 8 */
    ble     DCOPYN_L2x8_END
    COPY_2x8

DCOPYN_L2x8_END:


DCOPYN_L2x4_BEGIN:

    andi.   J, M, 4                 /* remaining chunk of 4 */
    ble     DCOPYN_L2x4_END
    COPY_2x4

DCOPYN_L2x4_END:


DCOPYN_L2x2_BEGIN:

    andi.   J, M, 2                 /* remaining chunk of 2 */
    ble     DCOPYN_L2x2_END
    COPY_2x2

DCOPYN_L2x2_END:


DCOPYN_L2x1_BEGIN:

    andi.   J, M, 1                 /* last single element */
    ble     DCOPYN_L2x1_END
    COPY_2x1

DCOPYN_L2x1_END:


DCOPYN_L2_END:


DCOPYN_L1_BEGIN:

    /*
     * Tail of the N loop: one last vector of A, taken when bit 0 of N is
     * set.  The section runs at most once; nothing in this block branches
     * back to DCOPYN_L1_LOOP.
     *
     * The test names N through its alias instead of the bare register
     * number, so it keeps working if the alias is ever remapped.
     */
    andi.   T1, N, 1
    ble     DCOPYN_L1_END

DCOPYN_L1_LOOP:

    /* A0 addresses the vector; A moves past it. */
    mr      A0, A
    add     A, A0, LDA

DCOPYN_L1x16_BEGIN:

    srawi.  J, M, 4                 /* J = M / 16 */
    ble     DCOPYN_L1x16_END

DCOPYN_L1x16_LOOP:

    COPY_1x16
    addic.  J, J, -1
    bgt     DCOPYN_L1x16_LOOP

DCOPYN_L1x16_END:


DCOPYN_L1x8_BEGIN:

    andi.   J, M, 8                 /* remaining chunk of 8 */
    ble     DCOPYN_L1x8_END
    COPY_1x8

DCOPYN_L1x8_END:


DCOPYN_L1x4_BEGIN:

    andi.   J, M, 4                 /* remaining chunk of 4 */
    ble     DCOPYN_L1x4_END
    COPY_1x4

DCOPYN_L1x4_END:


DCOPYN_L1x2_BEGIN:

    andi.   J, M, 2                 /* remaining chunk of 2 */
    ble     DCOPYN_L1x2_END
    COPY_1x2

DCOPYN_L1x2_END:


DCOPYN_L1x1_BEGIN:

    andi.   J, M, 1                 /* last single element */
    ble     DCOPYN_L1x1_END
    COPY_1x1

DCOPYN_L1x1_END:


DCOPYN_L1_END:


+ 691
- 0
kernel/power/dgemm_ncopy_macros_4_power8.S View File

@@ -0,0 +1,691 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/**********************************************************************************************
* Macros for N=4 and M=16
**********************************************************************************************/

.macro COPY_4x16

    /*
     * Interleave 16 doubles from each of A0, A1, A2 and A3 into BO.
     *
     * For every element index i the destination receives
     *     A0[i], A1[i], A2[i], A3[i]
     * so 64 doubles (512 bytes) are written.  A0..A3 advance by 128 bytes
     * and BO by 512 bytes.
     *
     * xxpermdi with selector 0 pairs the first doubleword of both sources,
     * selector 3 pairs the second doubleword of both sources.
     *
     * Registers written: vs0-vs31 and vs32-vs47.  The result is produced
     * in two halves that both go through vs32-vs47, so vs52-vs63 are never
     * written.  Those registers alias the VMX registers v20-v31, which are
     * nonvolatile under the Power ELF ABI and are not saved by the driver
     * that includes this file.
     */

    /* Load all 16 elements of every source vector. */
    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs16, o0, A2
    lxvd2x      vs24, o0, A3

    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1
    lxvd2x      vs17, o16, A2
    lxvd2x      vs25, o16, A3

    lxvd2x      vs2, o32, A0
    lxvd2x      vs10, o32, A1
    lxvd2x      vs18, o32, A2
    lxvd2x      vs26, o32, A3

    lxvd2x      vs3, o48, A0
    lxvd2x      vs11, o48, A1
    lxvd2x      vs19, o48, A2
    lxvd2x      vs27, o48, A3

    lxvd2x      vs4, o64, A0
    lxvd2x      vs12, o64, A1
    lxvd2x      vs20, o64, A2
    lxvd2x      vs28, o64, A3

    lxvd2x      vs5, o80, A0
    lxvd2x      vs13, o80, A1
    lxvd2x      vs21, o80, A2
    lxvd2x      vs29, o80, A3

    lxvd2x      vs6, o96, A0
    lxvd2x      vs14, o96, A1
    lxvd2x      vs22, o96, A2
    lxvd2x      vs30, o96, A3

    lxvd2x      vs7, o112, A0
    lxvd2x      vs15, o112, A1
    lxvd2x      vs23, o112, A2
    lxvd2x      vs31, o112, A3

    addi        A0, A0, 128
    addi        A1, A1, 128
    addi        A2, A2, 128
    addi        A3, A3, 128

    /* First half: element indices 0..7. */
    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs16, vs24, 0
    xxpermdi    vs34, vs0, vs8, 3
    xxpermdi    vs35, vs16, vs24, 3

    xxpermdi    vs36, vs1, vs9, 0
    xxpermdi    vs37, vs17, vs25, 0
    xxpermdi    vs38, vs1, vs9, 3
    xxpermdi    vs39, vs17, vs25, 3

    xxpermdi    vs40, vs2, vs10, 0
    xxpermdi    vs41, vs18, vs26, 0
    xxpermdi    vs42, vs2, vs10, 3
    xxpermdi    vs43, vs18, vs26, 3

    xxpermdi    vs44, vs3, vs11, 0
    xxpermdi    vs45, vs19, vs27, 0
    xxpermdi    vs46, vs3, vs11, 3
    xxpermdi    vs47, vs19, vs27, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

    stxvd2x     vs40, o0, BO
    stxvd2x     vs41, o16, BO
    stxvd2x     vs42, o32, BO
    stxvd2x     vs43, o48, BO
    stxvd2x     vs44, o64, BO
    stxvd2x     vs45, o80, BO
    stxvd2x     vs46, o96, BO
    stxvd2x     vs47, o112, BO
    addi        BO, BO, 128

    /* Second half: element indices 8..15, reusing vs32-vs47. */
    xxpermdi    vs32, vs4, vs12, 0
    xxpermdi    vs33, vs20, vs28, 0
    xxpermdi    vs34, vs4, vs12, 3
    xxpermdi    vs35, vs20, vs28, 3

    xxpermdi    vs36, vs5, vs13, 0
    xxpermdi    vs37, vs21, vs29, 0
    xxpermdi    vs38, vs5, vs13, 3
    xxpermdi    vs39, vs21, vs29, 3

    xxpermdi    vs40, vs6, vs14, 0
    xxpermdi    vs41, vs22, vs30, 0
    xxpermdi    vs42, vs6, vs14, 3
    xxpermdi    vs43, vs22, vs30, 3

    xxpermdi    vs44, vs7, vs15, 0
    xxpermdi    vs45, vs23, vs31, 0
    xxpermdi    vs46, vs7, vs15, 3
    xxpermdi    vs47, vs23, vs31, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

    stxvd2x     vs40, o0, BO
    stxvd2x     vs41, o16, BO
    stxvd2x     vs42, o32, BO
    stxvd2x     vs43, o48, BO
    stxvd2x     vs44, o64, BO
    stxvd2x     vs45, o80, BO
    stxvd2x     vs46, o96, BO
    stxvd2x     vs47, o112, BO
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/

.macro COPY_4x8

    /*
     * Interleave 8 doubles from each of A0..A3 into BO: for every element
     * index i the destination receives A0[i], A1[i], A2[i], A3[i].
     * A0..A3 advance by 64 bytes, BO by 256 bytes.
     * Registers written: vs0-vs3, vs8-vs11, vs16-vs19, vs24-vs27, vs32-vs47.
     */

    /* Loads, grouped by byte offset across the four sources. */
    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs16, o0, A2
    lxvd2x      vs24, o0, A3

    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1
    lxvd2x      vs17, o16, A2
    lxvd2x      vs25, o16, A3

    lxvd2x      vs2, o32, A0
    lxvd2x      vs10, o32, A1
    lxvd2x      vs18, o32, A2
    lxvd2x      vs26, o32, A3

    lxvd2x      vs3, o48, A0
    lxvd2x      vs11, o48, A1
    lxvd2x      vs19, o48, A2
    lxvd2x      vs27, o48, A3

    addi        A0, A0, 64
    addi        A1, A1, 64
    addi        A2, A2, 64
    addi        A3, A3, 64

    /* Element indices 0..3: pair up doublewords, then store 128 bytes. */
    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs16, vs24, 0
    xxpermdi    vs34, vs0, vs8, 3
    xxpermdi    vs35, vs16, vs24, 3

    xxpermdi    vs36, vs1, vs9, 0
    xxpermdi    vs37, vs17, vs25, 0
    xxpermdi    vs38, vs1, vs9, 3
    xxpermdi    vs39, vs17, vs25, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

    /* Element indices 4..7. */
    xxpermdi    vs40, vs2, vs10, 0
    xxpermdi    vs41, vs18, vs26, 0
    xxpermdi    vs42, vs2, vs10, 3
    xxpermdi    vs43, vs18, vs26, 3

    xxpermdi    vs44, vs3, vs11, 0
    xxpermdi    vs45, vs19, vs27, 0
    xxpermdi    vs46, vs3, vs11, 3
    xxpermdi    vs47, vs19, vs27, 3

    stxvd2x     vs40, o0, BO
    stxvd2x     vs41, o16, BO
    stxvd2x     vs42, o32, BO
    stxvd2x     vs43, o48, BO
    stxvd2x     vs44, o64, BO
    stxvd2x     vs45, o80, BO
    stxvd2x     vs46, o96, BO
    stxvd2x     vs47, o112, BO
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/

.macro COPY_4x4

    /*
     * Interleave 4 doubles from each of A0..A3 into BO: for every element
     * index i the destination receives A0[i], A1[i], A2[i], A3[i].
     * A0..A3 advance by 32 bytes, BO by 128 bytes.
     * Registers written: vs0, vs1, vs8, vs9, vs16, vs17, vs24, vs25,
     * vs32-vs39.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs16, o0, A2
    lxvd2x      vs24, o0, A3

    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1
    lxvd2x      vs17, o16, A2
    lxvd2x      vs25, o16, A3

    addi        A0, A0, 32
    addi        A1, A1, 32
    addi        A2, A2, 32
    addi        A3, A3, 32

    /* Element indices 0 and 1. */
    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs16, vs24, 0
    xxpermdi    vs34, vs0, vs8, 3
    xxpermdi    vs35, vs16, vs24, 3

    /* Element indices 2 and 3. */
    xxpermdi    vs36, vs1, vs9, 0
    xxpermdi    vs37, vs17, vs25, 0
    xxpermdi    vs38, vs1, vs9, 3
    xxpermdi    vs39, vs17, vs25, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/

.macro COPY_4x2

    /*
     * Interleave 2 doubles from each of A0..A3 into BO: the destination
     * receives A0[0], A1[0], A2[0], A3[0], A0[1], A1[1], A2[1], A3[1].
     * A0..A3 advance by 16 bytes, BO by 64 bytes.
     * Registers written: vs0, vs8, vs16, vs24, vs32-vs35.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs16, o0, A2
    lxvd2x      vs24, o0, A3

    addi        A0, A0, 16
    addi        A1, A1, 16
    addi        A2, A2, 16
    addi        A3, A3, 16

    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs16, vs24, 0
    xxpermdi    vs34, vs0, vs8, 3
    xxpermdi    vs35, vs16, vs24, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    addi        BO, BO, 64

.endm


/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/

.macro COPY_4x1

    /*
     * Gather one double from each of A0..A3 into BO: the destination
     * receives A0[0], A1[0], A2[0], A3[0].
     * A0..A3 advance by 8 bytes, BO by 32 bytes.
     * Registers written: vs0, vs8, vs16, vs24, vs32, vs33.
     */

    /* Scalar loads, one per source. */
    lxsdx       vs0, o0, A0
    lxsdx       vs8, o0, A1
    lxsdx       vs16, o0, A2
    lxsdx       vs24, o0, A3

    addi        A0, A0, 8
    addi        A1, A1, 8
    addi        A2, A2, 8
    addi        A3, A3, 8

    /* Pair the loaded scalars two by two. */
    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs16, vs24, 0

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    addi        BO, BO, 32

.endm


/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/

.macro COPY_2x16

    /*
     * Interleave 16 doubles from each of A0 and A1 into BO: for every
     * element index i the destination receives A0[i], A1[i].
     * A0 and A1 advance by 128 bytes, BO by 256 bytes.
     * Registers written: vs0-vs15, vs32-vs47.
     */

    /* Loads, grouped by byte offset across the two sources. */
    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1
    lxvd2x      vs2, o32, A0
    lxvd2x      vs10, o32, A1
    lxvd2x      vs3, o48, A0
    lxvd2x      vs11, o48, A1
    lxvd2x      vs4, o64, A0
    lxvd2x      vs12, o64, A1
    lxvd2x      vs5, o80, A0
    lxvd2x      vs13, o80, A1
    lxvd2x      vs6, o96, A0
    lxvd2x      vs14, o96, A1
    lxvd2x      vs7, o112, A0
    lxvd2x      vs15, o112, A1

    addi        A0, A0, 128
    addi        A1, A1, 128

    /* Element indices 0..7. */
    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs0, vs8, 3
    xxpermdi    vs34, vs1, vs9, 0
    xxpermdi    vs35, vs1, vs9, 3
    xxpermdi    vs36, vs2, vs10, 0
    xxpermdi    vs37, vs2, vs10, 3
    xxpermdi    vs38, vs3, vs11, 0
    xxpermdi    vs39, vs3, vs11, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

    /* Element indices 8..15. */
    xxpermdi    vs40, vs4, vs12, 0
    xxpermdi    vs41, vs4, vs12, 3
    xxpermdi    vs42, vs5, vs13, 0
    xxpermdi    vs43, vs5, vs13, 3
    xxpermdi    vs44, vs6, vs14, 0
    xxpermdi    vs45, vs6, vs14, 3
    xxpermdi    vs46, vs7, vs15, 0
    xxpermdi    vs47, vs7, vs15, 3

    stxvd2x     vs40, o0, BO
    stxvd2x     vs41, o16, BO
    stxvd2x     vs42, o32, BO
    stxvd2x     vs43, o48, BO
    stxvd2x     vs44, o64, BO
    stxvd2x     vs45, o80, BO
    stxvd2x     vs46, o96, BO
    stxvd2x     vs47, o112, BO
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

.macro COPY_2x8

    /*
     * Interleave 8 doubles from each of A0 and A1 into BO: for every
     * element index i the destination receives A0[i], A1[i].
     * A0 and A1 advance by 64 bytes, BO by 128 bytes.
     * Registers written: vs0-vs3, vs8-vs11, vs32-vs39.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1
    lxvd2x      vs2, o32, A0
    lxvd2x      vs10, o32, A1
    lxvd2x      vs3, o48, A0
    lxvd2x      vs11, o48, A1

    addi        A0, A0, 64
    addi        A1, A1, 64

    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs0, vs8, 3
    xxpermdi    vs34, vs1, vs9, 0
    xxpermdi    vs35, vs1, vs9, 3
    xxpermdi    vs36, vs2, vs10, 0
    xxpermdi    vs37, vs2, vs10, 3
    xxpermdi    vs38, vs3, vs11, 0
    xxpermdi    vs39, vs3, vs11, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    stxvd2x     vs36, o64, BO
    stxvd2x     vs37, o80, BO
    stxvd2x     vs38, o96, BO
    stxvd2x     vs39, o112, BO
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

.macro COPY_2x4

    /*
     * Interleave 4 doubles from each of A0 and A1 into BO: for every
     * element index i the destination receives A0[i], A1[i].
     * A0 and A1 advance by 32 bytes, BO by 64 bytes.
     * Registers written: vs0, vs1, vs8, vs9, vs32-vs35.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1
    lxvd2x      vs1, o16, A0
    lxvd2x      vs9, o16, A1

    addi        A0, A0, 32
    addi        A1, A1, 32

    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs0, vs8, 3
    xxpermdi    vs34, vs1, vs9, 0
    xxpermdi    vs35, vs1, vs9, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    stxvd2x     vs34, o32, BO
    stxvd2x     vs35, o48, BO
    addi        BO, BO, 64

.endm


/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/

.macro COPY_2x2

    /*
     * Interleave 2 doubles from each of A0 and A1 into BO: the destination
     * receives A0[0], A1[0], A0[1], A1[1].
     * A0 and A1 advance by 16 bytes, BO by 32 bytes.
     * Registers written: vs0, vs8, vs32, vs33.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs8, o0, A1

    addi        A0, A0, 16
    addi        A1, A1, 16

    xxpermdi    vs32, vs0, vs8, 0
    xxpermdi    vs33, vs0, vs8, 3

    stxvd2x     vs32, o0, BO
    stxvd2x     vs33, o16, BO
    addi        BO, BO, 32

.endm


/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

.macro COPY_2x1

    /*
     * Gather one double from each of A0 and A1 into BO: the destination
     * receives A0[0], A1[0].
     * A0 and A1 advance by 8 bytes, BO by 16 bytes.
     * Registers written: vs0, vs8, vs32.
     */

    lxsdx       vs0, o0, A0
    lxsdx       vs8, o0, A1

    addi        A0, A0, 8
    addi        A1, A1, 8

    /* Pair the two loaded scalars into one vector. */
    xxpermdi    vs32, vs0, vs8, 0

    stxvd2x     vs32, o0, BO
    addi        BO, BO, 16

.endm


/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/

.macro COPY_1x16

    /*
     * Straight copy of 16 doubles (128 bytes) from A0 to BO; no
     * interleaving is needed for a single source.
     * A0 and BO both advance by 128 bytes.
     * Registers written: vs0-vs7.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs1, o16, A0
    lxvd2x      vs2, o32, A0
    lxvd2x      vs3, o48, A0
    lxvd2x      vs4, o64, A0
    lxvd2x      vs5, o80, A0
    lxvd2x      vs6, o96, A0
    lxvd2x      vs7, o112, A0

    /* One 128 byte run of stores, then a single pointer update. */
    stxvd2x     vs0, o0, BO
    stxvd2x     vs1, o16, BO
    stxvd2x     vs2, o32, BO
    stxvd2x     vs3, o48, BO
    stxvd2x     vs4, o64, BO
    stxvd2x     vs5, o80, BO
    stxvd2x     vs6, o96, BO
    stxvd2x     vs7, o112, BO

    addi        A0, A0, 128
    addi        BO, BO, 128

.endm


/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/

.macro COPY_1x8

    /*
     * Straight copy of 8 doubles (64 bytes) from A0 to BO.
     * A0 and BO both advance by 64 bytes.
     * Registers written: vs0-vs3.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs1, o16, A0
    lxvd2x      vs2, o32, A0
    lxvd2x      vs3, o48, A0

    stxvd2x     vs0, o0, BO
    stxvd2x     vs1, o16, BO
    stxvd2x     vs2, o32, BO
    stxvd2x     vs3, o48, BO

    addi        A0, A0, 64
    addi        BO, BO, 64

.endm


/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/

.macro COPY_1x4

    /*
     * Straight copy of 4 doubles (32 bytes) from A0 to BO.
     * A0 and BO both advance by 32 bytes.
     * Registers written: vs0, vs1.
     */

    lxvd2x      vs0, o0, A0
    lxvd2x      vs1, o16, A0

    stxvd2x     vs0, o0, BO
    stxvd2x     vs1, o16, BO

    addi        A0, A0, 32
    addi        BO, BO, 32

.endm


/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

.macro COPY_1x2

    /*
     * Straight copy of 2 doubles (16 bytes) from A0 to BO.
     * A0 and BO both advance by 16 bytes.
     * Registers written: vs0.
     */

    lxvd2x      vs0, o0, A0
    stxvd2x     vs0, o0, BO

    addi        A0, A0, 16
    addi        BO, BO, 16

.endm


/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/

.macro COPY_1x1

    /*
     * Copy a single double (8 bytes) from A0 to BO with scalar load/store.
     * A0 and BO both advance by 8 bytes.
     * Registers written: vs0.
     */

    lxsdx       vs0, o0, A0
    stxsdx      vs0, o0, BO

    addi        A0, A0, 8
    addi        BO, BO, 8

.endm


+ 1
- 1
kernel/power/dgemm_tcopy_16_power8.S View File

@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B
add B1, B1, B

li PREA, 768
li PREA, 256
addi PREB, M16, 128

li o8, 8


+ 4
- 0
kernel/power/dgemm_tcopy_logic_16_power8.S View File

@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:

DCOPYT_L4x16_LOOP:

/*
addi T1, PREB, 128
addi T2, PREB, 256
*/
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
/*
dcbtst BO, M16
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
*/
COPY_4x16

add BO, BO, M16


+ 1
- 1
kernel/power/dtrmm_kernel_16x4_power8.S View File

@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31

#include "dgemm_macros_16x4_power8.S"
#include "dtrmm_macros_16x4_power8.S"


#ifndef NEEDPARAM


+ 3431
- 0
kernel/power/dtrmm_macros_16x4_power8.S
File diff suppressed because it is too large
View File


+ 207
- 0
kernel/power/sgemm_tcopy_8_power8.S View File

@@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

/*
 * Register aliases for the single precision "tcopy" packing routine.
 * All aliases have to be defined before the macro file is included,
 * because the preprocessor substitutes them inside the macro bodies.
 */

/* Values read on entry: extents, source base, source stride, destination. */
#define M       r3
#define N       r4
#define A       r5
#define LDA     r6
#define B       r7

/* Read pointers for up to four LDA-strided rows handled together. */
#define A0      r8
#define A1      r9
#define A2      r10
#define A3      r11

/* Loop counters: I runs over groups of four in M, J over chunks of 8 in N. */
#define I       r23
#define J       r12

/*
 * Destination pointers.  BO is the pointer handed to the copy macros;
 * B8, B4, B2 and B1 are the starts of the regions receiving the chunks
 * of width 8, 4, 2 and 1.
 */
#define BO      r16
#define B8      r17
#define B4      r18
#define B2      r19
#define B1      r20

/* M * 8 elements expressed in bytes: step of BO between chunks of 8. */
#define M8      r30

/* Prefetch distances in bytes; also used as scratch during setup. */
#define PREA    r14
#define PREB    r15

/* Constant byte offsets used as index registers; o0 is the literal zero. */
#define o0      0
#define o4      r21
#define o16     r24
#define o32     r25
#define o48     r26

/* Scratch registers. */
#define T1      r31
#define T2      r22

/* Reserved name; not referenced by the code visible in this file. */
#define NOTU1   r29

#include "sgemm_tcopy_macros_8_power8.S"

/* Bytes reserved on the stack; the register save area uses offsets 144..287. */
#define STACKSIZE 384


PROLOGUE
PROFCODE

    /*
     * Entry point of the single precision tcopy packing routine.
     *
     * Reserves STACKSIZE bytes, saves r14-r31 at offsets 280..144, computes
     * the destination region pointers, runs the included copy logic and
     * returns 0 in r3.  Nothing is copied when M <= 0 or N <= 0.
     *
     * NOTE(review): no floating point or vector registers are saved here,
     * so the included macros must only use volatile ones - confirm against
     * the macro file.
     */
    addi    SP, SP, -STACKSIZE
    li      r0, 0

    /* Nonvolatile general purpose registers. */
    std     r14, 280(SP)
    std     r15, 272(SP)
    std     r16, 264(SP)
    std     r17, 256(SP)
    std     r18, 248(SP)
    std     r19, 240(SP)
    std     r20, 232(SP)
    std     r21, 224(SP)
    std     r22, 216(SP)
    std     r23, 208(SP)
    std     r24, 200(SP)
    std     r25, 192(SP)
    std     r26, 184(SP)
    std     r27, 176(SP)
    std     r28, 168(SP)
    std     r29, 160(SP)
    std     r30, 152(SP)
    std     r31, 144(SP)

    /* Nothing to do for an empty extent. */
    cmpwi   cr0, M, 0
    ble-    L999
    cmpwi   cr0, N, 0
    ble-    L999

    /* Stride in bytes, and the byte size of M * 8 elements. */
    slwi    LDA, LDA, BASE_SHIFT
    slwi    M8, M, 3 + BASE_SHIFT

    /* B4 = B + (N rounded down to a multiple of 8) * M elements. */
    li      T2, -8
    and     B4, N, T2
    mullw   B4, B4, M
    slwi    B4, B4, BASE_SHIFT
    add     B4, B4, B

    /* B2 = B + (N rounded down to a multiple of 4) * M elements. */
    li      PREA, -4
    and     B2, N, PREA
    mullw   B2, B2, M
    slwi    B2, B2, BASE_SHIFT
    add     B2, B2, B

    /* B1 = B + (N rounded down to a multiple of 2) * M elements. */
    li      PREB, -2
    and     B1, N, PREB
    mullw   B1, B1, M
    slwi    B1, B1, BASE_SHIFT
    add     B1, B1, B

    /* PREA/PREB were scratch above; now load the prefetch distances. */
    li      PREA, 384
    addi    PREB, M8, 128

    /* Constant index registers used by the copy macros. */
    li      o48, 48
    li      o32, 32
    li      o16, 16
    li      o4, 4

#include "sgemm_tcopy_logic_8_power8.S"

L999:

    /* Restore the saved registers from the slots used above. */
    ld      r14, 280(SP)
    ld      r15, 272(SP)
    ld      r16, 264(SP)
    ld      r17, 256(SP)
    ld      r18, 248(SP)
    ld      r19, 240(SP)
    ld      r20, 232(SP)
    ld      r21, 224(SP)
    ld      r22, 216(SP)
    ld      r23, 208(SP)
    ld      r24, 200(SP)
    ld      r25, 192(SP)
    ld      r26, 184(SP)
    ld      r27, 176(SP)
    ld      r28, 168(SP)
    ld      r29, 160(SP)
    ld      r30, 152(SP)
    ld      r31, 144(SP)

    /* Return value 0, release the frame and return. */
    li      r3, 0
    addi    SP, SP, STACKSIZE

    blr
EPILOGUE



+ 299
- 0
kernel/power/sgemm_tcopy_logic_8_power8.S View File

@@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/*
 * M loop, four LDA-strided rows of A per pass.
 *
 * Each pass walks N in chunks of 8 (writing through BO, which steps by M8
 * bytes between chunks), then handles at most one chunk each of 4, 2 and 1
 * elements, written through B4, B2 and B1.  The chunk-of-8 loop is unrolled
 * four times with an early exit after every step; the prefetch hints are
 * issued once per trip through the unrolled body.
 */
    srawi.  I, M, 2                 /* I = M / 4 */
    ble     SCOPYOT_L2_BEGIN


SCOPYOT_L4_BEGIN:

    /* Reserve the next 32 elements of B for this pass. */
    mr      B8, B
    addi    B, B, 32*SIZE

    /* A0..A3 address four consecutive rows; A moves past them. */
    mr      A0, A
    add     A1, A0, LDA
    add     A2, A1, LDA
    add     A3, A2, LDA
    add     A, A3, LDA

    sradi.  J, N, 3                 /* J = N / 8 */
    ble     SCOPYOT_L4x4_BEGIN

    mr      BO, B8
    .align 5

SCOPYOT_L4x8_LOOP:

    /* Prefetch hints PREA bytes ahead of each read pointer. */
    dcbt    A3, PREA
    dcbt    A2, PREA
    dcbt    A1, PREA
    dcbt    A0, PREA

    /* Unrolled step 1 of 4. */
    COPY_4x8

    add     BO, BO, M8
    addi    A3, A3, 8*SIZE
    addi    A2, A2, 8*SIZE
    addi    A1, A1, 8*SIZE
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    ble     SCOPYOT_L4x4_BEGIN

    /* Unrolled step 2 of 4. */
    COPY_4x8

    add     BO, BO, M8
    addi    A3, A3, 8*SIZE
    addi    A2, A2, 8*SIZE
    addi    A1, A1, 8*SIZE
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    ble     SCOPYOT_L4x4_BEGIN

    /* Unrolled step 3 of 4. */
    COPY_4x8

    add     BO, BO, M8
    addi    A3, A3, 8*SIZE
    addi    A2, A2, 8*SIZE
    addi    A1, A1, 8*SIZE
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    ble     SCOPYOT_L4x4_BEGIN

    /* Unrolled step 4 of 4; loops back while chunks remain. */
    COPY_4x8

    add     BO, BO, M8
    addi    A3, A3, 8*SIZE
    addi    A2, A2, 8*SIZE
    addi    A1, A1, 8*SIZE
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    bgt     SCOPYOT_L4x8_LOOP

SCOPYOT_L4x4_BEGIN:

    andi.   T1, N, 4                /* remaining chunk of 4 */
    ble     SCOPYOT_L4x2_BEGIN

    mr      BO, B4

    COPY_4x4

    addi    B4, B4, 16*SIZE

    addi    A3, A3, 4*SIZE
    addi    A2, A2, 4*SIZE
    addi    A1, A1, 4*SIZE
    addi    A0, A0, 4*SIZE

SCOPYOT_L4x2_BEGIN:

    andi.   T1, N, 2                /* remaining chunk of 2 */
    ble     SCOPYOT_L4x1_BEGIN

    mr      BO, B2

    COPY_4x2

    addi    B2, B2, 8*SIZE

    addi    A3, A3, 2*SIZE
    addi    A2, A2, 2*SIZE
    addi    A1, A1, 2*SIZE
    addi    A0, A0, 2*SIZE

SCOPYOT_L4x1_BEGIN:

    andi.   T1, N, 1                /* last single element */
    ble     SCOPYOT_L4_END

    mr      BO, B1

    COPY_4x1

    addi    B1, B1, 4*SIZE

    addi    A3, A3, 1*SIZE
    addi    A2, A2, 1*SIZE
    addi    A1, A1, 1*SIZE
    addi    A0, A0, 1*SIZE

SCOPYOT_L4_END:

    /* Next group of four rows. */
    addic.  I, I, -1
    bgt     SCOPYOT_L4_BEGIN



SCOPYOT_L2_BEGIN:

    /*
     * Tail of the M loop: two rows of A, taken when bit 1 of M is set.
     * N is walked in chunks of 8 through BO (stepping by M8 bytes), then
     * at most one chunk each of 4, 2 and 1 through B4, B2 and B1.
     */
    andi.   T1, M, 2
    ble     SCOPYOT_L1_BEGIN

    /* Reserve the next 16 elements of B for this pass. */
    mr      B8, B
    addi    B, B, 16*SIZE

    /* A0/A1 address the two rows; A moves past them. */
    mr      A0, A
    add     A1, A0, LDA
    add     A, A1, LDA

    sradi.  J, N, 3                 /* J = N / 8 */
    ble     SCOPYOT_L2x4_BEGIN

    mr      BO, B8

SCOPYOT_L2x8_LOOP:

    COPY_2x8

    add     BO, BO, M8
    addi    A1, A1, 8*SIZE
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    bgt     SCOPYOT_L2x8_LOOP

SCOPYOT_L2x4_BEGIN:

    andi.   T1, N, 4                /* remaining chunk of 4 */
    ble     SCOPYOT_L2x2_BEGIN

    mr      BO, B4

    COPY_2x4

    addi    B4, B4, 8*SIZE

    addi    A1, A1, 4*SIZE
    addi    A0, A0, 4*SIZE

SCOPYOT_L2x2_BEGIN:

    andi.   T1, N, 2                /* remaining chunk of 2 */
    ble     SCOPYOT_L2x1_BEGIN

    mr      BO, B2

    COPY_2x2

    addi    B2, B2, 4*SIZE

    addi    A1, A1, 2*SIZE
    addi    A0, A0, 2*SIZE

SCOPYOT_L2x1_BEGIN:

    andi.   T1, N, 1                /* last single element */
    ble     SCOPYOT_L2_END

    mr      BO, B1

    COPY_2x1

    addi    B1, B1, 2*SIZE

    addi    A1, A1, 1*SIZE
    addi    A0, A0, 1*SIZE

SCOPYOT_L2_END:


SCOPYOT_L1_BEGIN:

    /*
     * Tail of the M loop: one last row of A, taken when bit 0 of M is set;
     * otherwise control leaves through L999.  N is walked in chunks of 8
     * through BO (stepping by M8 bytes), then at most one chunk each of
     * 4, 2 and 1 through B4, B2 and B1.
     */
    andi.   T1, M, 1
    ble     L999

    /* Reserve the next 8 elements of B for this pass. */
    mr      B8, B
    addi    B, B, 8*SIZE

    /* A0 addresses the row; A moves past it. */
    mr      A0, A
    add     A, A0, LDA

    sradi.  J, N, 3                 /* J = N / 8 */
    ble     SCOPYOT_L1x4_BEGIN

    mr      BO, B8

SCOPYOT_L1x8_LOOP:

    COPY_1x8

    add     BO, BO, M8
    addi    A0, A0, 8*SIZE

    addic.  J, J, -1
    bgt     SCOPYOT_L1x8_LOOP

SCOPYOT_L1x4_BEGIN:

    andi.   T1, N, 4                /* remaining chunk of 4 */
    ble     SCOPYOT_L1x2_BEGIN

    mr      BO, B4

    COPY_1x4

    addi    B4, B4, 4*SIZE
    addi    A0, A0, 4*SIZE

SCOPYOT_L1x2_BEGIN:

    andi.   T1, N, 2                /* remaining chunk of 2 */
    ble     SCOPYOT_L1x1_BEGIN

    mr      BO, B2

    COPY_1x2

    addi    B2, B2, 2*SIZE
    addi    A0, A0, 2*SIZE

SCOPYOT_L1x1_BEGIN:

    andi.   T1, N, 1                /* last single element */
    ble     SCOPYOT_L1_END

    mr      BO, B1

    COPY_1x1

    addi    B1, B1, 1*SIZE
    addi    A0, A0, 1*SIZE

SCOPYOT_L1_END:


+ 308
- 0
kernel/power/sgemm_tcopy_macros_8_power8.S View File

@@ -0,0 +1,308 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/

// COPY_4x8: copies 8 single-precision words from each of A0, A1, A2, A3
// (two 4-word vector loads per pointer) to eight consecutive 16-byte slots
// starting at BO, in the order A0, A1, A2, A3.
// Clobbers vs32-vs39 and T1. A0-A3 and BO are not modified by the macro.
// Assumes o16/o32/o48 hold the byte offsets 16/32/48 (set up by the
// including file; not visible here).
.macro COPY_4x8

lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0

lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1

lxvw4x vs36, o0, A2
lxvw4x vs37, o16, A2

lxvw4x vs38, o0, A3
lxvw4x vs39, o16, A3

// T1 is the running store pointer so that BO itself stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1

stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1

// Second 64-byte half of the output.
addi T1, T1, 64

stxvw4x vs36, o0, T1
stxvw4x vs37, o16, T1

stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1

.endm

/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/

// COPY_4x4: copies one 4-word vector from each of A0, A1, A2, A3 to four
// consecutive 16-byte slots starting at BO.
// Clobbers vs32-vs35 and T1. A0-A3 and BO are not modified by the macro.
// Assumes o16/o32/o48 hold the byte offsets 16/32/48 (not visible here).
.macro COPY_4x4

lxvw4x vs32, o0, A0

lxvw4x vs33, o0, A1

lxvw4x vs34, o0, A2

lxvw4x vs35, o0, A3

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1

stxvw4x vs33, o16, T1

stxvw4x vs34, o32, T1

stxvw4x vs35, o48, T1

.endm

/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/

// COPY_4x2: copies 2 single-precision scalars from each of A0, A1, A2, A3
// to BO, one pair per source pointer, in the order A0, A1, A2, A3.
// Clobbers vs32-vs39 and T1. A0-A3 and BO are not modified by the macro.
// Assumes o4 holds the byte offset 4 (not visible here).
.macro COPY_4x2

lxsspx vs32, o0, A0
lxsspx vs33, o4, A0

lxsspx vs34, o0, A1
lxsspx vs35, o4, A1

lxsspx vs36, o0, A2
lxsspx vs37, o4, A2

lxsspx vs38, o0, A3
lxsspx vs39, o4, A3

// T1 is the running store pointer; it moves 8 bytes after each pair.
mr T1, BO

stxsspx vs32, o0, T1
stxsspx vs33, o4, T1

addi T1, T1, 8

stxsspx vs34, o0, T1
stxsspx vs35, o4, T1

addi T1, T1, 8

stxsspx vs36, o0, T1
stxsspx vs37, o4, T1

addi T1, T1, 8

stxsspx vs38, o0, T1
stxsspx vs39, o4, T1

.endm

/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/

// COPY_4x1: copies 1 single-precision scalar from each of A0, A1, A2, A3
// to BO in that order.
// Clobbers vs32-vs35 and T1. A0-A3 and BO are not modified by the macro.
// Assumes o4 holds the byte offset 4 (not visible here).
.macro COPY_4x1

lxsspx vs32, o0, A0

lxsspx vs33, o0, A1

lxsspx vs34, o0, A2

lxsspx vs35, o0, A3

// T1 is the running store pointer; BO stays untouched.
mr T1, BO

stxsspx vs32, o0, T1

stxsspx vs33, o4, T1

// Second pair of outputs.
addi T1, T1, 8

stxsspx vs34, o0, T1

stxsspx vs35, o4, T1

.endm

/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

// COPY_2x8: copies 8 single-precision words from each of A0 and A1 (two
// 4-word vector loads per pointer) to four consecutive 16-byte slots at BO.
// Clobbers vs32-vs35 and T1. A0, A1 and BO are not modified by the macro.
// Assumes o16/o32/o48 hold the byte offsets 16/32/48 (not visible here).
.macro COPY_2x8

lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0

lxvw4x vs34, o0, A1
lxvw4x vs35, o16, A1

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1

stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1

.endm

/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

// COPY_2x4: copies one 4-word vector from each of A0 and A1 to two
// consecutive 16-byte slots at BO.
// Clobbers vs32, vs33 and T1. A0, A1 and BO are not modified by the macro.
// Assumes o16 holds the byte offset 16 (not visible here).
.macro COPY_2x4

lxvw4x vs32, o0, A0

lxvw4x vs33, o0, A1

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1

stxvw4x vs33, o16, T1

.endm

/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/

// COPY_2x2: copies 2 single-precision scalars from each of A0 and A1 to BO,
// the A0 pair first.
// Clobbers vs32-vs35 and T1. A0, A1 and BO are not modified by the macro.
// Assumes o4 holds the byte offset 4 (not visible here).
.macro COPY_2x2

lxsspx vs32, o0, A0
lxsspx vs33, o4, A0

lxsspx vs34, o0, A1
lxsspx vs35, o4, A1

// T1 is the running store pointer; BO stays untouched.
mr T1, BO

stxsspx vs32, o0, T1
stxsspx vs33, o4, T1

// Move on to the A1 pair.
addi T1, T1, 8

stxsspx vs34, o0, T1
stxsspx vs35, o4, T1

.endm

/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

// COPY_2x1: copies 1 single-precision scalar from each of A0 and A1 to BO,
// the A0 value first.
// Clobbers vs32, vs33 and T1. A0, A1 and BO are not modified by the macro.
// Assumes o4 holds the byte offset 4 (not visible here).
.macro COPY_2x1

lxsspx vs32, o0, A0

lxsspx vs33, o0, A1

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxsspx vs32, o0, T1

stxsspx vs33, o4, T1

.endm

/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/

// COPY_1x8: copies 8 single-precision words from A0 (two 4-word vector
// loads) to two consecutive 16-byte slots at BO.
// Clobbers vs32, vs33 and T1. A0 and BO are not modified by the macro.
// Assumes o16 holds the byte offset 16 (not visible here).
.macro COPY_1x8

lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1

.endm

/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/

// COPY_1x4: copies one 4-word vector from A0 to BO.
// Clobbers vs32 and T1. A0 and BO are not modified by the macro.
.macro COPY_1x4

lxvw4x vs32, o0, A0

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxvw4x vs32, o0, T1

.endm

/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

// COPY_1x2: copies 2 single-precision scalars from A0 to BO.
// Clobbers vs32, vs33 and T1. A0 and BO are not modified by the macro.
// Assumes o4 holds the byte offset 4 (not visible here).
.macro COPY_1x2

lxsspx vs32, o0, A0
lxsspx vs33, o4, A0

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxsspx vs32, o0, T1
stxsspx vs33, o4, T1

.endm

/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/

// COPY_1x1: copies 1 single-precision scalar from A0 to BO.
// Clobbers vs32 and T1. A0 and BO are not modified by the macro.
.macro COPY_1x1

lxsspx vs32, o0, A0

// T1 is the store pointer; BO stays untouched.
mr T1, BO

stxsspx vs32, o0, T1

.endm


+ 71
- 1
kernel/power/zgemm_kernel_8x2_power8.S View File

@@ -1,3 +1,73 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@@ -250,7 +320,7 @@
ble L999

slwi LDC, LDC, ZBASE_SHIFT
li PRE, 384
li PRE, 512
li o8 , 8
li o16 , 16
li o24 , 24


+ 60
- 9
kernel/power/zgemm_logic_8x2_power8.S View File

@@ -1,3 +1,39 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


srawi. J, N, 1
ble ZGEMM_L2_END

@@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:

mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1
srawi. T1, K, 2
ble ZGEMM_L2_COPYB1

ZGEMM_L2_COPYB:
ZGEMM_L2_COPYB8:

lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi T2, PRE, 128
dcbt BO, PRE
dcbtst BBO, PRE
dcbtst BBO, T2
ZCOPYB_8x1
addic. T1, T1, -1
addi BBO, BBO, 32

bge ZGEMM_L2_COPYB
bgt ZGEMM_L2_COPYB8

ZGEMM_L2_COPYB1:

andi. T1, K, 3
ble ZGEMM_L2_COPYB_END

ZGEMM_L2_COPYB_LOOP:

ZCOPYB_1x1
ZCOPYB_1x1
addic. T1, T1, -1

bgt ZGEMM_L2_COPYB_LOOP

ZGEMM_L2_COPYB_END:

mr CO, C
mr AO, A
@@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
slwi T1, K, 0

ZGEMM_L1_COPYB:
dcbtst BBO, PRE

lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i


+ 108
- 0
kernel/power/zgemm_macros_8x2_power8.S View File

@@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

#define XSFADD_R1 xsadddp
@@ -3055,3 +3090,76 @@

.endm



// ZCOPYB_1x1: expands one 16-byte element at BO (two doublewords, labelled
// b0_r and b0_i below) into 32 bytes at BBO. Each doubleword is loaded with
// lxvdsx, which replicates it into both halves of the vector register.
// Advances BO by 16 bytes and BBO by 32 bytes. Clobbers vs4 and vs5.
// Assumes o8/o16 hold the byte offsets 8/16.
.macro ZCOPYB_1x1

lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi BBO, BBO, 32

.endm


// ZCOPYB_8x1: expands 128 bytes at BO (eight 16-byte vectors) into 256 bytes
// at BBO. Each input vector yields two output vectors: its doubleword 0
// splatted, then its doubleword 1 splatted.
// Advances BO by 128 bytes and BBO by 256 bytes. Clobbers vs32-vs55.
// Assumes o16/o32/o48 hold the byte offsets 16/32/48.
.macro ZCOPYB_8x1

// Load the first four input vectors.
lxvd2x vs32, o0, BO
lxvd2x vs33, o16, BO
lxvd2x vs34, o32, BO
lxvd2x vs35, o48, BO
addi BO, BO, 64

// Load the remaining four input vectors.
lxvd2x vs36, o0, BO
lxvd2x vs37, o16, BO
lxvd2x vs38, o32, BO
lxvd2x vs39, o48, BO
addi BO, BO, 64

// Splat both doublewords of vs32-vs35 into vs40-vs47.
xxspltd vs40, vs32, 0
xxspltd vs41, vs32, 1
xxspltd vs42, vs33, 0
xxspltd vs43, vs33, 1
xxspltd vs44, vs34, 0
xxspltd vs45, vs34, 1
xxspltd vs46, vs35, 0
xxspltd vs47, vs35, 1

// Splat both doublewords of vs36-vs39 into vs48-vs55.
xxspltd vs48, vs36, 0
xxspltd vs49, vs36, 1
xxspltd vs50, vs37, 0
xxspltd vs51, vs37, 1
xxspltd vs52, vs38, 0
xxspltd vs53, vs38, 1
xxspltd vs54, vs39, 0
xxspltd vs55, vs39, 1

// Store the 16 result vectors in four 64-byte groups.
stxvd2x vs40, o0, BBO
stxvd2x vs41, o16, BBO
stxvd2x vs42, o32, BBO
stxvd2x vs43, o48, BBO
addi BBO, BBO, 64

stxvd2x vs44, o0, BBO
stxvd2x vs45, o16, BBO
stxvd2x vs46, o32, BBO
stxvd2x vs47, o48, BBO
addi BBO, BBO, 64

stxvd2x vs48, o0, BBO
stxvd2x vs49, o16, BBO
stxvd2x vs50, o32, BBO
stxvd2x vs51, o48, BBO
addi BBO, BBO, 64

stxvd2x vs52, o0, BBO
stxvd2x vs53, o16, BBO
stxvd2x vs54, o32, BBO
stxvd2x vs55, o48, BBO
addi BBO, BBO, 64

.endm



+ 205
- 0
kernel/power/zgemm_tcopy_8_power8.S View File

@@ -0,0 +1,205 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

// Register aliases used by this routine and the included logic/macros files.

// Values the routine reads from r3-r7 on entry (presumably its arguments in
// this order; confirm against the C prototype of the copy routine).
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7

// Source pointers; the logic file sets them to consecutive lines of A that
// are LDA bytes apart.
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11

// Inner loop counter (N / 8 iterations in the logic file).
#define J r12

// PREA/PREB: first used as scratch masks, then as prefetch offsets for
// dcbt/dcbtst.
#define PREA r14
#define PREB r15
// BO: destination pointer handed to the COPY_* macros.
#define BO r16
// B8/B4/B2/B1: destination cursors for the 8-wide groups and for the
// 4-, 2- and 1-wide tails.
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20
// NOTUS1/NOTUS2 are not referenced in the code visible here.
#define NOTUS1 r21
#define T2 r22
// Outer loop counter (M / 4 iterations in the logic file).
#define I r23
// Constant byte offsets 16/32/48, loaded with li before the logic runs.
#define o16 r24
#define o32 r25
#define o48 r26
#define NOTUS2 r27
// M8: destination stride in bytes between successive 8-wide groups.
#define M8 r30
#define T1 r31

// o0 is the literal 0 used as the index operand of indexed loads/stores.
#define o0 0

#include "zgemm_tcopy_macros_8_power8.S"

#define STACKSIZE 384


// Entry point of the copy routine.
// Saves r14-r31 in a STACKSIZE-byte frame, returns early when M <= 0 or
// N <= 0, prepares the strides, destination cursors and offset registers,
// runs the included copy logic, then restores the registers and returns 0
// in r3.
PROLOGUE
PROFCODE

addi SP, SP, -STACKSIZE
// r0 is zeroed here but not referenced again in the code visible here.
li r0, 0

// Save the non-volatile registers r14-r31.
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)

// Nothing to copy for non-positive dimensions.
// NOTE(review): cmpwi, slwi and mullw below work on 32-bit quantities;
// presumably the sizes passed in always fit -- confirm against the callers.
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999

// LDA becomes a byte stride; M8 = M * 8 elements, in bytes.
// (ZBASE_SHIFT is presumably log2 of the complex element size; it comes
// from common.h.)
slwi LDA, LDA, ZBASE_SHIFT
slwi M8, M, 3 + ZBASE_SHIFT

// Masks that round N down to a multiple of 8, 4 and 2. PREA and PREB are
// borrowed as scratch here and get their real values further down.
li T2, -8
li PREA, -4
li PREB, -2

// B4 = B + (N & -8) * M elements, B2 = B + (N & -4) * M elements,
// B1 = B + (N & -2) * M elements: the start of the 4-, 2- and 1-wide tail
// areas of the destination.
and B4, N, T2
and B2, N, PREA
and B1, N, PREB
mullw B4, B4, M
mullw B2, B2, M
mullw B1, B1, M

slwi B4, B4, ZBASE_SHIFT
slwi B2, B2, ZBASE_SHIFT
slwi B1, B1, ZBASE_SHIFT

add B4, B4, B
add B2, B2, B
add B1, B1, B

// Prefetch offsets used by dcbt/dcbtst in the copy logic.
li PREA, 384
addi PREB, M8, 128

// Constant byte offsets for the indexed vector loads and stores.
li o16, 16
li o32, 32
li o48, 48

#include "zgemm_tcopy_logic_8_power8.S"

// Common exit: return value 0, restore r14-r31 and release the frame.
L999:

li r3, 0

ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)

addi SP, SP, STACKSIZE

blr
EPILOGUE



+ 246
- 0
kernel/power/zgemm_tcopy_logic_8_power8.S View File

@@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


// Main pass: I = M / 4 iterations, each copying from four source pointers
// (A0-A3) that are one LDA apart. Skipped entirely when M < 4.
srawi. I, M, 2
ble ZCOPYT_L2_BEGIN


ZCOPYT_L4_BEGIN:

// Set up the four source pointers and move A past all four lines.
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
// B8 remembers where this pass writes its 8-wide groups; B is advanced by
// 64*SIZE for the next pass.
mr B8, B
addi B, B, 64*SIZE

// J = N / 8: number of full 8-wide groups.
sradi. J, N, 3
ble ZCOPYT_L4x4_BEGIN

mr BO, B8

.align 5

ZCOPYT_L4x8_LOOP:

// Prefetch hints: dcbt for the four source lines, dcbtst for the destination.
// T1/T2 are rebuilt on every iteration (presumably because COPY_4x8
// overwrites T1; the macro body is not visible here).
addi T1, PREB, 128
addi T2, PREB, 256
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
dcbtst BO, M8
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2

// A0-A3 are not advanced in this file; presumably the COPY_* macros do
// that themselves -- confirm in zgemm_tcopy_macros_8_power8.S.
COPY_4x8

add BO, BO, M8

addic. J, J, -1
bgt ZCOPYT_L4x8_LOOP

// Tail of 4 (bit 2 of N); destination cursor is B4.
ZCOPYT_L4x4_BEGIN:

andi. T1, N, 4
ble ZCOPYT_L4x2_BEGIN

mr BO, B4

COPY_4x4


addi B4, B4, 32*SIZE

// Tail of 2 (bit 1 of N); destination cursor is B2.
ZCOPYT_L4x2_BEGIN:

andi. T1, N, 2
ble ZCOPYT_L4x1_BEGIN

mr BO, B2

COPY_4x2


addi B2, B2, 16*SIZE

// Tail of 1 (bit 0 of N); destination cursor is B1.
ZCOPYT_L4x1_BEGIN:

andi. T1, N, 1
ble ZCOPYT_L4_END

mr BO, B1

COPY_4x1


addi B1, B1, 8*SIZE

ZCOPYT_L4_END:

addic. I, I, -1
bgt ZCOPYT_L4_BEGIN



// Two-line remainder pass: runs only when bit 1 of M is set and copies from
// two source pointers (A0, A1) that are one LDA apart.
ZCOPYT_L2_BEGIN:

andi. T1, M, 2
// The masked value is 0 or 2, so ble is taken only when the bit is clear.
ble ZCOPYT_L1_BEGIN

mr A0, A
add A1, A0, LDA
add A, A1, LDA
// B8 remembers where this pass writes its 8-wide groups; B is advanced by
// 32*SIZE for the next pass.
mr B8, B
addi B, B, 32*SIZE

// J = N / 8: number of full 8-wide groups.
sradi. J, N, 3
ble ZCOPYT_L2x4_BEGIN

mr BO, B8

ZCOPYT_L2x8_LOOP:

// A0/A1 are not advanced in this file; presumably the COPY_* macros do
// that themselves -- confirm in zgemm_tcopy_macros_8_power8.S.
COPY_2x8

add BO, BO, M8

addic. J, J, -1
bgt ZCOPYT_L2x8_LOOP

// Tail of 4 (bit 2 of N); destination cursor is B4.
ZCOPYT_L2x4_BEGIN:

andi. T1, N, 4
ble ZCOPYT_L2x2_BEGIN

mr BO, B4

COPY_2x4


addi B4, B4, 16*SIZE

// Tail of 2 (bit 1 of N); destination cursor is B2.
ZCOPYT_L2x2_BEGIN:

andi. T1, N, 2
ble ZCOPYT_L2x1_BEGIN

mr BO, B2

COPY_2x2


addi B2, B2, 8*SIZE

// Tail of 1 (bit 0 of N); destination cursor is B1.
ZCOPYT_L2x1_BEGIN:

andi. T1, N, 1
ble ZCOPYT_L2_END

mr BO, B1

COPY_2x1


addi B1, B1, 4*SIZE

ZCOPYT_L2_END:


// Single-line remainder pass: runs only when bit 0 of M is set; otherwise
// control leaves through L999.
ZCOPYT_L1_BEGIN:

andi. T1, M, 1
// The masked value is 0 or 1, so ble is taken only when the bit is clear.
ble L999

mr A0, A
add A, A0, LDA
// B8 remembers where this pass writes its 8-wide groups; B is advanced by
// 16*SIZE.
mr B8, B
addi B, B, 16*SIZE

// J = N / 8: number of full 8-wide groups.
sradi. J, N, 3
ble ZCOPYT_L1x4_BEGIN

mr BO, B8

ZCOPYT_L1x8_LOOP:

// A0 is not advanced in this file; presumably the COPY_* macros do that
// themselves -- confirm in zgemm_tcopy_macros_8_power8.S.
COPY_1x8

add BO, BO, M8

addic. J, J, -1
bgt ZCOPYT_L1x8_LOOP

// Tail of 4 (bit 2 of N); destination cursor is B4.
ZCOPYT_L1x4_BEGIN:

andi. T1, N, 4
ble ZCOPYT_L1x2_BEGIN

mr BO, B4

COPY_1x4


addi B4, B4, 8*SIZE

// Tail of 2 (bit 1 of N); destination cursor is B2.
ZCOPYT_L1x2_BEGIN:

andi. T1, N, 2
ble ZCOPYT_L1x1_BEGIN

mr BO, B2

COPY_1x2


addi B2, B2, 4*SIZE

// Tail of 1 (bit 0 of N); destination cursor is B1.
ZCOPYT_L1x1_BEGIN:

andi. T1, N, 1
ble ZCOPYT_L1_END

mr BO, B1

COPY_1x1


addi B1, B1, 2*SIZE

ZCOPYT_L1_END:


+ 535
- 0
kernel/power/zgemm_tcopy_macros_8_power8.S View File

@@ -0,0 +1,535 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/

.macro COPY_4x8

/* Loads eight 16-byte vectors from each of A0, A1, A2 and A3 (vs32..vs63) */
/* and stores all 32 of them, in that order, starting at BO.               */
/* Each source pointer ends up advanced by 128 bytes. T1 is used as the    */
/* store cursor and is overwritten; BO itself is not modified.             */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

/* Source A0 -> vs32..vs39 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64

lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64


/* Source A1 -> vs40..vs47 */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64

lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64


/* Source A2 -> vs48..vs55 */
lxvd2x vs48, o0, A2
lxvd2x vs49, o16, A2
lxvd2x vs50, o32, A2
lxvd2x vs51, o48, A2
addi A2, A2, 64

lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
addi A2, A2, 64


/* Source A3 -> vs56..vs63 */
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
addi A3, A3, 64

lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
addi A3, A3, 64


/* Store phase: T1 walks the destination in 64-byte steps from BO. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

addi T1, T1, 64

stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64

stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1

addi T1, T1, 64

stxvd2x vs48, o0, T1
stxvd2x vs49, o16, T1
stxvd2x vs50, o32, T1
stxvd2x vs51, o48, T1
addi T1, T1, 64

stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1

addi T1, T1, 64

stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
addi T1, T1, 64

stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1

.endm


/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/

.macro COPY_4x4

/* Loads four 16-byte vectors from each of A0, A1, A2 and A3 (vs32..vs47)  */
/* and stores all 16 of them, in that order, starting at BO.               */
/* Each source pointer ends up advanced by 64 bytes. T1 is used as the     */
/* store cursor and is overwritten; BO itself is not modified.             */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

/* Source A0 -> vs32..vs35 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64


/* Source A1 -> vs36..vs39 */
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64


/* Source A2 -> vs40..vs43 */
lxvd2x vs40, o0, A2
lxvd2x vs41, o16, A2
lxvd2x vs42, o32, A2
lxvd2x vs43, o48, A2
addi A2, A2, 64


/* Source A3 -> vs44..vs47 */
lxvd2x vs44, o0, A3
lxvd2x vs45, o16, A3
lxvd2x vs46, o32, A3
lxvd2x vs47, o48, A3
addi A3, A3, 64


/* Store phase: T1 walks the destination in 64-byte steps from BO. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1

addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

addi T1, T1, 64

stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1

addi T1, T1, 64

stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1

.endm


/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/

.macro COPY_4x2

/* Loads two 16-byte vectors from each of A0, A1, A2 and A3 (vs32..vs39)   */
/* and stores all 8 of them, in that order, starting at BO.                */
/* Each source pointer ends up advanced by 32 bytes. T1 is used as the     */
/* store cursor and is overwritten; BO itself is not modified.             */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

/* Source A0 -> vs32..vs33 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32


/* Source A1 -> vs34..vs35 */
lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32


/* Source A2 -> vs36..vs37 */
lxvd2x vs36, o0, A2
lxvd2x vs37, o16, A2
addi A2, A2, 32


/* Source A3 -> vs38..vs39 */
lxvd2x vs38, o0, A3
lxvd2x vs39, o16, A3
addi A3, A3, 32


/* Store phase: two sources per 64-byte chunk. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1

stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1

addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1

stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

.endm


/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/

.macro COPY_4x1

/* Loads one 16-byte vector from each of A0, A1, A2 and A3 (vs32..vs35)    */
/* and stores the four of them, in that order, starting at BO.             */
/* Each source pointer ends up advanced by 16 bytes. T1 is overwritten     */
/* with BO; BO itself is not modified.                                     */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

lxvd2x vs32, o0, A0
addi A0, A0, 16


lxvd2x vs33, o0, A1
addi A1, A1, 16


lxvd2x vs34, o0, A2
addi A2, A2, 16


lxvd2x vs35, o0, A3
addi A3, A3, 16


/* Store phase: one vector per source, back to back. */
mr T1, BO

stxvd2x vs32, o0, T1

stxvd2x vs33, o16, T1

stxvd2x vs34, o32, T1

stxvd2x vs35, o48, T1

.endm


/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

.macro COPY_2x8

/* Loads eight 16-byte vectors from each of A0 and A1 (vs32..vs47) and     */
/* stores all 16 of them, in that order, starting at BO.                   */
/* Each source pointer ends up advanced by 128 bytes. T1 is used as the    */
/* store cursor and is overwritten; BO itself is not modified.             */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

/* Source A0 -> vs32..vs39 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64

lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64


/* Source A1 -> vs40..vs47 */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64

lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64


/* Store phase: T1 walks the destination in 64-byte steps from BO. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

addi T1, T1, 64

stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64

stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1

.endm


/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

.macro COPY_2x4

/* Loads four 16-byte vectors from each of A0 and A1 (vs32..vs39) and      */
/* stores all 8 of them, in that order, starting at BO.                    */
/* Each source pointer ends up advanced by 64 bytes. T1 is used as the     */
/* store cursor and is overwritten; BO itself is not modified.             */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

/* Source A0 -> vs32..vs35 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64


/* Source A1 -> vs36..vs39 */
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64


/* Store phase: one source per 64-byte chunk. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1

addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

.endm


/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/

.macro COPY_2x2

/* Loads two 16-byte vectors from each of A0 and A1 (vs32..vs35) and       */
/* stores the four of them, in that order, starting at BO.                 */
/* Each source pointer ends up advanced by 32 bytes. T1 is overwritten     */
/* with BO; BO itself is not modified.                                     */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32


lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32


/* Store phase: A0 data first, then A1 data. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1

stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1

.endm


/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

.macro COPY_2x1

/* Loads one 16-byte vector from each of A0 and A1 (vs32, vs33) and stores */
/* them, in that order, starting at BO.                                    */
/* Each source pointer ends up advanced by 16 bytes. T1 is overwritten     */
/* with BO; BO itself is not modified.                                     */
/* NOTE(review): o0/o16 are defined outside this file; presumably index    */
/* registers holding byte offsets 0/16 - confirm.                          */

lxvd2x vs32, o0, A0
addi A0, A0, 16


lxvd2x vs33, o0, A1
addi A1, A1, 16


mr T1, BO

stxvd2x vs32, o0, T1

stxvd2x vs33, o16, T1

.endm


/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/

.macro COPY_1x8

/* Loads eight 16-byte vectors from A0 (vs32..vs39) and stores them, in    */
/* the same order, starting at BO.                                         */
/* A0 ends up advanced by 128 bytes. T1 is used as the store cursor and is */
/* overwritten; BO itself is not modified.                                 */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64

lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64


/* Store phase: two 64-byte chunks from BO. */
mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64

stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1

.endm


/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/

.macro COPY_1x4

/* Loads four 16-byte vectors from A0 (vs32..vs35) and stores them, in the */
/* same order, starting at BO.                                             */
/* A0 ends up advanced by 64 bytes. T1 is overwritten with BO; BO itself   */
/* is not modified.                                                        */
/* NOTE(review): o0/o16/o32/o48 are defined outside this file; presumably  */
/* index registers holding byte offsets 0/16/32/48 - confirm.              */

lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64


mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1

.endm


/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

.macro COPY_1x2

/* Loads two 16-byte vectors from A0 (vs32, vs33) and stores them, in the  */
/* same order, starting at BO.                                             */
/* A0 ends up advanced by 32 bytes. T1 is overwritten with BO; BO itself   */
/* is not modified.                                                        */
/* NOTE(review): o0/o16 are defined outside this file; presumably index    */
/* registers holding byte offsets 0/16 - confirm.                          */

lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32


mr T1, BO

stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1

.endm


/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/

.macro COPY_1x1

/* Loads one 16-byte vector from A0 into vs32 and stores it at BO.         */
/* A0 ends up advanced by 16 bytes. T1 is overwritten with BO; BO itself   */
/* is not modified.                                                        */
/* NOTE(review): o0 is defined outside this file; presumably an index      */
/* register holding byte offset 0 - confirm.                               */

lxvd2x vs32, o0, A0
addi A0, A0, 16


mr T1, BO

stxvd2x vs32, o0, T1

.endm


+ 17
- 0
kernel/setparam-ref.c View File

@@ -933,6 +933,23 @@ static void init_parameter(void) {
#endif
#endif

#ifdef EXCAVATOR

#ifdef DEBUG
fprintf(stderr, "Excavator\n");
#endif

TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif


#ifdef PILEDRIVER

#ifdef DEBUG


+ 19
- 17
kernel/x86_64/KERNEL.EXCAVATOR View File

@@ -1,3 +1,7 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c

SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
@@ -20,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c

DCOPYKERNEL = dcopy_bulldozer.S
@@ -68,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c
STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c
STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c
STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c
CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c
CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c
CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c

ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c
ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c
ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c
ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c


+ 1
- 1
kernel/x86_64/caxpy.c View File

@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"


+ 1
- 1
kernel/x86_64/cdot.c View File

@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(BULLDOZER)
#include "cdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "cdot_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "cdot_microk_haswell-2.c"


+ 1
- 1
kernel/x86_64/cgemv_n_4.c View File

@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_n_microk_bulldozer-4.c"
#endif



+ 1
- 1
kernel/x86_64/cgemv_t_4.c View File

@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(HASWELL)
#include "cgemv_t_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_t_microk_bulldozer-4.c"
#endif



+ 1
- 1
kernel/x86_64/cscal.c View File

@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "cscal_microk_bulldozer-2.c"
#elif defined(STEAMROLLER)
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cscal_microk_steamroller-2.c"
#elif defined(SANDYBRIDGE)
#include "cscal_microk_bulldozer-2.c"


+ 1
- 1
kernel/x86_64/daxpy.c View File

@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER)
#include "daxpy_microk_bulldozer-2.c"
#elif defined(STEAMROLLER)
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "daxpy_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "daxpy_microk_piledriver-2.c"


+ 1
- 1
kernel/x86_64/ddot.c View File

@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(BULLDOZER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER)
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ddot_microk_steamroller-2.c"
#elif defined(PILEDRIVER)
#include "ddot_microk_piledriver-2.c"


+ 1
- 1
kernel/x86_64/dgemv_n_4.c View File

@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL) || defined(STEAMROLLER)
#elif defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_n_microk_haswell-4.c"
#endif



+ 1
- 1
kernel/x86_64/dgemv_t_4.c View File

@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if defined(HASWELL) || defined(STEAMROLLER)
#if defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dgemv_t_microk_haswell-4.c"
#endif



+ 1
- 1
kernel/x86_64/dscal.c View File

@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dscal_microk_bulldozer-2.c"
#elif defined(SANDYBRIDGE)
#include "dscal_microk_sandy-2.c"


+ 1
- 1
kernel/x86_64/dsymv_L.c View File

@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "dsymv_L_microk_haswell-2.c"


+ 1
- 1
kernel/x86_64/dsymv_U.c View File

@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "dsymv_U_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "dsymv_U_microk_haswell-2.c"


+ 1
- 1
kernel/x86_64/saxpy.c View File

@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "saxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "saxpy_microk_sandy-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "saxpy_microk_piledriver-2.c"
#endif



+ 1
- 1
kernel/x86_64/sdot.c View File

@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(BULLDOZER)
#include "sdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "sdot_microk_steamroller-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"


+ 2
- 2
kernel/x86_64/sgemv_n_4.c View File

@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_n_microk_haswell-4.c"
#endif

#if defined(STEAMROLLER)
#if defined(STEAMROLLER) || defined(EXCAVATOR)
#define NBMAX 2048
#else
#define NBMAX 4096


+ 2
- 2
kernel/x86_64/sgemv_t_4.c View File

@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_t_microk_haswell-4.c"
#endif

#if defined(STEAMROLLER)
#if defined(STEAMROLLER) || defined(EXCAVATOR)
#define NBMAX 2048
#else
#define NBMAX 4096


+ 1
- 1
kernel/x86_64/ssymv_L.c View File

@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_L_microk_nehalem-2.c"


+ 1
- 1
kernel/x86_64/ssymv_U.c View File

@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "ssymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ssymv_U_microk_nehalem-2.c"


+ 1
- 1
kernel/x86_64/zaxpy.c View File

@@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(BULLDOZER)
#include "zaxpy_microk_bulldozer-2.c"
#elif defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zaxpy_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "zaxpy_microk_haswell-2.c"


+ 1
- 1
kernel/x86_64/zdot.c View File

@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(BULLDOZER)
#include "zdot_microk_bulldozer-2.c"
#elif defined(STEAMROLLER) || defined(PILEDRIVER)
#elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR)
#include "zdot_microk_steamroller-2.c"
#elif defined(HASWELL)
#include "zdot_microk_haswell-2.c"


+ 1
- 1
kernel/x86_64/zgemv_n_4.c View File

@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemv_n_microk_haswell-4.c"
#elif defined(SANDYBRIDGE)
#include "zgemv_n_microk_sandy-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_n_microk_bulldozer-4.c"
#endif



+ 1
- 1
kernel/x86_64/zgemv_t_4.c View File

@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"


#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zgemv_t_microk_bulldozer-4.c"
#elif defined(HASWELL)
#include "zgemv_t_microk_haswell-4.c"


+ 1
- 1
kernel/x86_64/zscal.c View File

@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zscal_microk_haswell-2.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER)
#include "zscal_microk_bulldozer-2.c"
#elif defined(STEAMROLLER)
#elif defined(STEAMROLLER) || defined(EXCAVATOR)
#include "zscal_microk_steamroller-2.c"
#endif



+ 9
- 9
param.h View File

@@ -1977,15 +1977,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2

#define SGEMM_DEFAULT_P 960
#define DGEMM_DEFAULT_P 480
#define CGEMM_DEFAULT_P 720
#define ZGEMM_DEFAULT_P 480
#define SGEMM_DEFAULT_Q 720
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 720
#define ZGEMM_DEFAULT_Q 720
#define SGEMM_DEFAULT_P 1280
#define DGEMM_DEFAULT_P 640
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640

#define SYMV_P 8



+ 8
- 0
test/Makefile View File

@@ -4,6 +4,7 @@ include ../Makefile.system
all :: level1 level2 level3

level1 : sblat1 dblat1 cblat1 zblat1
ifndef CROSS
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1
@@ -21,8 +22,10 @@ else
OPENBLAS_NUM_THREADS=2 ./zblat1
endif
endif
endif

level2 : sblat2 dblat2 cblat2 zblat2
ifndef CROSS
rm -f ?BLAT2.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat
@$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0
@@ -54,8 +57,10 @@ else
@$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0
endif
endif
endif

level3 : sblat3 dblat3 cblat3 zblat3
ifndef CROSS
rm -f ?BLAT3.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0
@@ -87,9 +92,11 @@ else
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
endif
endif
endif


level3_3m : zblat3_3m cblat3_3m
ifndef CROSS
rm -f ?BLAT3_3M.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
@$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
@@ -109,6 +116,7 @@ else
@$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
endif
endif
endif





+ 2
- 0
utest/Makefile View File

@@ -21,7 +21,9 @@ $(UTESTBIN): $(OBJS)
$(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)

run_test: $(UTESTBIN)
ifndef CROSS
./$(UTESTBIN)
endif

clean:
-rm -f *.o $(UTESTBIN)


Loading…
Cancel
Save