Browse Source

Merge branch 'develop' into bulldozer

Conflicts:
	kernel/x86_64/KERNEL.BULLDOZER
tags/v0.2.9.rc1
Zhang Xianyi 13 years ago
parent
commit
72b1edaf1b
100 changed files with 22312 additions and 2519 deletions
  1. +4
    -0
      .gitignore
  2. +24
    -0
      .travis.yml
  3. +87
    -0
      CONTRIBUTORS.md
  4. +50
    -0
      Changelog.txt
  5. +49
    -44
      Makefile
  6. +0
    -5
      Makefile.generic
  7. +18
    -13
      Makefile.install
  8. +0
    -11
      Makefile.power
  9. +6
    -3
      Makefile.rule
  10. +0
    -1
      Makefile.sparc
  11. +121
    -11
      Makefile.system
  12. +0
    -3
      Makefile.x86
  13. +0
    -13
      Makefile.x86_64
  14. +29
    -14
      README.md
  15. +1
    -1
      TargetList.txt
  16. +12
    -6
      c_check
  17. +10
    -0
      cblas.h
  18. +17
    -0
      common.h
  19. +9
    -1
      common_alpha.h
  20. +8
    -1
      common_ia64.h
  21. +14
    -1
      common_linux.h
  22. +10
    -3
      common_mips64.h
  23. +10
    -1
      common_sparc.h
  24. +10
    -3
      common_x86.h
  25. +9
    -1
      common_x86_64.h
  26. +7
    -0
      cpuid.h
  27. +1
    -0
      cpuid_power.c
  28. +68
    -11
      cpuid_x86.c
  29. +14
    -0
      ctest.c
  30. +1
    -1
      ctest/Makefile
  31. +6
    -4
      driver/level2/sbmv_thread.c
  32. +2
    -2
      driver/level3/gemm_thread_n.c
  33. +14
    -1
      driver/level3/level3.c
  34. +24
    -1
      driver/level3/level3_gemm3m_thread.c
  35. +23
    -0
      driver/level3/level3_syrk_threaded.c
  36. +38
    -2
      driver/level3/level3_thread.c
  37. +4
    -1
      driver/others/Makefile
  38. +1
    -0
      driver/others/blas_server.c
  39. +50
    -6
      driver/others/blas_server_omp.c
  40. +2
    -1
      driver/others/blas_server_win32.c
  41. +42
    -7
      driver/others/dynamic.c
  42. +5
    -2
      driver/others/init.c
  43. +22
    -1
      driver/others/memory.c
  44. +52
    -0
      driver/others/openblas_get_parallel.c
  45. +14
    -9
      exports/Makefile
  46. +56
    -21
      exports/gensymbol
  47. +1
    -1
      f_check
  48. +38
    -13
      getarch.c
  49. +40
    -2
      getarch_2nd.c
  50. +13
    -0
      interface/trtri.c
  51. +1
    -1
      kernel/Makefile.L2
  52. +216
    -216
      kernel/Makefile.L3
  53. +16
    -0
      kernel/setparam-ref.c
  54. +59
    -0
      kernel/x86/KERNEL.PILEDRIVER
  55. +5
    -5
      kernel/x86/gemv_n_sse.S
  56. +3
    -3
      kernel/x86/gemv_n_sse2.S
  57. +9
    -19
      kernel/x86/gemv_t_sse.S
  58. +13
    -16
      kernel/x86/gemv_t_sse2.S
  59. +2
    -2
      kernel/x86/lsame.S
  60. +5
    -5
      kernel/x86/trsm_kernel_LN_2x4_sse2.S
  61. +11
    -11
      kernel/x86/trsm_kernel_LN_4x4_sse.S
  62. +5
    -5
      kernel/x86/trsm_kernel_LT_2x4_sse2.S
  63. +11
    -11
      kernel/x86/trsm_kernel_LT_4x4_sse.S
  64. +5
    -5
      kernel/x86/trsm_kernel_RT_2x4_sse2.S
  65. +11
    -11
      kernel/x86/trsm_kernel_RT_4x4_sse.S
  66. +56
    -12
      kernel/x86/zgemv_n_sse.S
  67. +55
    -11
      kernel/x86/zgemv_n_sse2.S
  68. +58
    -13
      kernel/x86/zgemv_t_sse.S
  69. +58
    -14
      kernel/x86/zgemv_t_sse2.S
  70. +2
    -2
      kernel/x86/ztrsm_kernel_LN_2x2_sse.S
  71. +2
    -2
      kernel/x86/ztrsm_kernel_LT_2x2_sse.S
  72. +2
    -2
      kernel/x86/ztrsm_kernel_RT_2x2_sse.S
  73. +45
    -36
      kernel/x86_64/KERNEL.BULLDOZER
  74. +70
    -0
      kernel/x86_64/KERNEL.PILEDRIVER
  75. +5
    -1
      kernel/x86_64/axpy_sse.S
  76. +4
    -1
      kernel/x86_64/axpy_sse2.S
  77. +1900
    -0
      kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
  78. +53
    -4
      kernel/x86_64/cgemv_n.S
  79. +46
    -2
      kernel/x86_64/cgemv_t.S
  80. +408
    -0
      kernel/x86_64/daxpy_bulldozer.S
  81. +291
    -0
      kernel/x86_64/dcopy_bulldozer.S
  82. +311
    -0
      kernel/x86_64/ddot_bulldozer.S
  83. +0
    -1860
      kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
  84. +3880
    -0
      kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
  85. +1821
    -0
      kernel/x86_64/dgemm_ncopy_8_bulldozer.S
  86. +667
    -0
      kernel/x86_64/dgemm_tcopy_8_bulldozer.S
  87. +49
    -7
      kernel/x86_64/dgemv_n.S
  88. +2325
    -0
      kernel/x86_64/dgemv_n_bulldozer.S
  89. +1938
    -0
      kernel/x86_64/dgemv_t_bulldozer.S
  90. +360
    -0
      kernel/x86_64/gemm_ncopy_2_bulldozer.S
  91. +374
    -0
      kernel/x86_64/gemm_tcopy_2_bulldozer.S
  92. +4657
    -0
      kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
  93. +48
    -8
      kernel/x86_64/sgemv_n.S
  94. +5
    -5
      kernel/x86_64/sgemv_t.S
  95. +1
    -1
      kernel/x86_64/symv_L_sse.S
  96. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  97. +1
    -1
      kernel/x86_64/symv_U_sse.S
  98. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  99. +1407
    -0
      kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
  100. +3
    -3
      kernel/x86_64/zgemm_kernel_4x4_sandy.S

+ 4
- 0
.gitignore View File

@@ -4,12 +4,16 @@
*.dylib *.dylib
*.def *.def
*.o *.o
*.out
lapack-3.1.1 lapack-3.1.1
lapack-3.1.1.tgz lapack-3.1.1.tgz
lapack-3.4.1 lapack-3.4.1
lapack-3.4.1.tgz lapack-3.4.1.tgz
lapack-3.4.2 lapack-3.4.2
lapack-3.4.2.tgz lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
*.so *.so
*.a *.a
.svn .svn


+ 24
- 0
.travis.yml View File

@@ -0,0 +1,24 @@
language: c
compiler:
- gcc

env:
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"

before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi

script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE

# whitelist
branches:
only:
- master
- develop

+ 87
- 0
CONTRIBUTORS.md View File

@@ -0,0 +1,87 @@
# Contributions to the OpenBLAS project

## Creator & Maintainer

* Zhang Xianyi <traits.zhang@gmail.com>

## Active Developers

* Wang Qian <traz0824@gmail.com>
* Optimize BLAS3 on ICT Loongson 3A.
* Optimize BLAS3 on Intel Sandy Bridge.

* Zaheer Chothia <zaheer.chothia@gmail.com>
* Improve compatibility for complex numbers
* Build LAPACKE: C interface to LAPACK
* Improve the windows build.

## Previous Developers

* Chen Shaohu <huhumartinwar@gmail.com>
* Optimize GEMV on the Loongson 3A processor.

* Luo Wen
* Intern. Test Level-2 BLAS.

## Contributors

In chronological order:

* pipping <http://page.mi.fu-berlin.de/pipping>
* [2011-06-11] Make USE_OPENMP=0 disable openmp.

* Stefan Karpinski <stefan@karpinski.org>
* [2011-12-28] Fix a bug about SystemStubs on Mac OS X.

* Alexander Eberspächer <https://github.com/aeberspaecher>
* [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32.

* Mike Nolta <mike@nolta.net>
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.

* Sylvestre Ledru <https://github.com/sylvestre>
* [2012-07-01] Improve the detection of sparc. Fix building bug under
Hurd and kfreebsd.

* Jameson Nash <https://github.com/vtjnash>
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
make on the command line.

* Alexander Nasonov <alnsn@yandex.ru>
* [2012-11-10] Fix NetBSD build.

* Sébastien Villemot <sebastien@debian.org>
* [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.

* Werner Saar <wernsaar@googlemail.com>
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer

* Kang-Che Sung <Explorer09@gmail.com>
* [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.

* Kenneth Hoste <kenneth.hoste@gmail.com>
* [2013-05-22] Adjust Makefile about downloading LAPACK source files.

* Lei WANG <https://github.com/wlbksy>
* [2013-05-22] Fix a bug about wget.

* Dan Luu <http://www.linkedin.com/in/danluu>
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).

* grisuthedragon <https://github.com/grisuthedragon>
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
model is used by OpenBLAS.

* Sébastien Fabbro <bicatali@gentoo.org>
* [2013-07-24] Modify makefile to respect user's LDFLAGS
* [2013-07-24] Add stack markings for GNU as arch-independent for assembler files

* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

+ 50
- 0
Changelog.txt View File

@@ -1,4 +1,54 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.7
20-Jul-2013
common:
* Support LSB (Linux Standard Base) 4.1.
e.g. make CC=lsbcc
* Include LAPACK 3.4.2 source codes to the repo.
Avoid downloading at compile time.
* Add NO_PARALLEL_MAKE flag to disable parallel make.
* Create openblas_get_parallel to retrieve information about which
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
* Change LIBSUFFIX from .lib to .a on windows.
* A workaround for the dtrti_U single-thread bug. Replace it with LAPACK code. (#191)

x86/x86-64:
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
AMD Bulldozer. (Thank Werner Saar)
* Add Intel Haswell support (using Sandybridge optimizations).
(Thank Dan Luu)
* Add AMD Piledriver support (using Bulldozer optimizations).
* Fix the computational error in zgemm avx kernel on
Sandybridge. (#237)
* Fix the overflow bug in gemv.
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
is very large.(#214, #221, #246).
MIPS64:
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.

Power:
* Support Power7 by old Power6 kernels. (#220)

====================================================================
Version 0.2.6
2-Mar-2013
common:
* Improved OpenMP performance slightly. (d744c9)
* Improved cblas.h compatibility with Intel MKL.(#185)
* Fixed the overflowing bug in single thread cholesky factorization.
* Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174)

x86/x86-64:
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
We will tune the performance in future.
* Auto-detect Intel Xeon E7540.
* Fixed the overflowing buffer bug of gemv. (#173)
* Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189)

MIPS64:

==================================================================== ====================================================================
Version 0.2.5 Version 0.2.5
26-Nov-2012 26-Nov-2012


+ 49
- 44
Makefile View File

@@ -82,27 +82,27 @@ endif
shared : shared :
ifndef NO_SHARED ifndef NO_SHARED
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), NetBSD) ifeq ($(OSNAME), NetBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
$(MAKE) -C exports dyn
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@$(MAKE) -C exports dyn
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
@$(MAKE) -C exports dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
@$(MAKE) -C exports dll
endif endif
endif endif


@@ -131,30 +131,33 @@ endif
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
endif endif
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@for d in $(SUBDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \ fi; \
done done
#Save the config files for installation #Save the config files for installation
cp Makefile.conf Makefile.conf_last
cp config.h config_last.h
@cp Makefile.conf Makefile.conf_last
@cp config.h config_last.h
ifdef QUAD_PRECISION ifdef QUAD_PRECISION
echo "#define QUAD_PRECISION">> config_last.h
@echo "#define QUAD_PRECISION">> config_last.h
endif endif
ifeq ($(EXPRECISION), 1) ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h
@echo "#define EXPRECISION">> config_last.h
endif endif
## ##
ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
@$(MAKE) -C kernel commonlibs || exit 1
@for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done done
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif endif
touch lib.grd
ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
endif
@touch lib.grd


prof : prof_blas prof_lapack prof : prof_blas prof_lapack


@@ -203,19 +206,19 @@ ifeq ($(NO_LAPACK), 1)
netlib : netlib :


else else
netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
netlib : lapack_prebuild
ifndef NOFORTRAN ifndef NOFORTRAN
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
endif endif
ifndef NO_LAPACKE ifndef NO_LAPACKE
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif endif
endif endif


prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof


$(NETLIB_LAPACK_DIR)/make.inc :
lapack_prebuild :
ifndef NOFORTRAN ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -224,11 +227,7 @@ ifndef NOFORTRAN
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
ifdef INTERFACE64
-@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -244,7 +243,7 @@ endif
lapack-3.4.2 : lapack-3.4.2.tgz lapack-3.4.2 : lapack-3.4.2.tgz
ifndef NOFORTRAN ifndef NOFORTRAN
ifndef NO_LAPACK ifndef NO_LAPACK
@if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
@if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
echo $(TAR) zxf $< ;\ echo $(TAR) zxf $< ;\
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
@@ -262,27 +261,31 @@ lapack-3.4.2.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL)
curl -O $(LAPACK_URL);
else else
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
fetch $(LAPACK_URL);
else else
wget $(LAPACK_URL)
wget -O $@ $(LAPACK_URL);
endif endif
endif endif
endif endif


large.tgz : large.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/large.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif endif


timing.tgz : timing.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/timing.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif endif


lapack-timing : lapack-3.4.2 large.tgz timing.tgz
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
@@ -314,10 +317,12 @@ clean ::
#endif #endif
@$(MAKE) -C reference clean @$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
ifeq ($(OSNAME), Darwin)
@rm -rf getarch.dSYM getarch_2nd.dSYM
endif
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf $(NETLIB_LAPACK_DIR) ;\
fi
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f *.grd Makefile.conf_last config_last.h @rm -f *.grd Makefile.conf_last config_last.h
@echo Done.
@echo Done.

+ 0
- 5
Makefile.generic View File

@@ -1,6 +1 @@
COPT = -Wall -O2 # -DGEMMTEST COPT = -Wall -O2 # -DGEMMTEST
ifdef BINARY64
else
# LDFLAGS = -m elf32ppc
LDFLAGS = -m elf_i386
endif

+ 18
- 13
Makefile.install View File

@@ -5,6 +5,7 @@ include ./Makefile.system


OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
OPENBLAS_BUILD_DIR:=$(CURDIR)


.PHONY : install .PHONY : install
.NOTPARALLEL : install .NOTPARALLEL : install
@@ -48,32 +49,36 @@ endif
#for install static library #for install static library
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
#for install shared library #for install shared library
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), NetBSD) ifeq ($(OSNAME), NetBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
@-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif endif


@echo Install OK! @echo Install OK!


+ 0
- 11
Makefile.power View File

@@ -17,13 +17,7 @@ endif
endif endif


ifdef BINARY64 ifdef BINARY64
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf64ppc
endif


ifeq ($(OSNAME), Darwin)
LDFLAGS = -arch ppc64
endif


ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -mpowerpc64 -maix64 CCOMMON_OPT += -mpowerpc64 -maix64
@@ -34,17 +28,12 @@ ifeq ($(COMPILER_F77), xlf)
FCOMMON_OPT += -q64 FCOMMON_OPT += -q64
endif endif
ARFLAGS = -X 64 ARFLAGS = -X 64
LDFLAGS = -b64
ASFLAGS = -a64 ASFLAGS = -a64
endif endif
else else
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf32ppc
endif
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -Wa,-a32 CCOMMON_OPT += -Wa,-a32
ARFLAGS = -X 32 ARFLAGS = -X 32
LDFLAGS = -b32
ASFLAGS = -a32 ASFLAGS = -a32
endif endif
endif endif


+ 6
- 3
Makefile.rule View File

@@ -3,7 +3,7 @@
# #


# This library's version # This library's version
VERSION = 0.2.5
VERSION = 0.2.7


# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -81,6 +81,9 @@ VERSION = 0.2.5
# and OS. However, the performance is low. # and OS. However, the performance is low.
# NO_AVX = 1 # NO_AVX = 1


# Don't use parallel make.
# NO_PARALLEL_MAKE = 1

# If you would like to know minute performance report of GotoBLAS. # If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1 # FUNCTION_PROFILE = 1


@@ -104,8 +107,8 @@ VERSION = 0.2.5


# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with a single thread. You can use this flag to avoid the overhead of multi-threading # with a single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 50
# in small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4


# If you need sanity check by comparing reference BLAS. It'll be very # If you need sanity check by comparing reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).


+ 0
- 1
Makefile.sparc View File

@@ -10,7 +10,6 @@ endif
ifeq ($(COMPILER_F77), f90) ifeq ($(COMPILER_F77), f90)
FCOMMON_OPT += -xarch=v9 FCOMMON_OPT += -xarch=v9
endif endif
LDFLAGS = -64
else else


CCOMMON_OPT += -mcpu=v9 CCOMMON_OPT += -mcpu=v9


+ 121
- 11
Makefile.system View File

@@ -9,9 +9,7 @@ ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif


ifndef NETLIB_LAPACK_DIR
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib


# Default C compiler # Default C compiler
# - Only set if not specified on the command line or inherited from the environment. # - Only set if not specified on the command line or inherited from the environment.
@@ -20,6 +18,12 @@ endif
# - Default value is 'cc' which is not always a valid command (e.g. MinGW). # - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default) ifeq ($(origin CC),default)
CC = gcc CC = gcc
# Change the default compiler to clang on Mac OS X.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
endif
endif endif


# Default Fortran compiler (FC) is selected by f_check. # Default Fortran compiler (FC) is selected by f_check.
@@ -53,7 +57,7 @@ GETARCH_FLAGS += -DUSE64BITINT
endif endif


ifndef GEMM_MULTITHREAD_THRESHOLD ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=50
GEMM_MULTITHREAD_THRESHOLD=4
endif endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)


@@ -65,6 +69,19 @@ ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g GETARCH_FLAGS += -g
endif endif


ifeq ($(QUIET_MAKE), 1)
MAKE += -s
endif

ifndef NO_PARALLEL_MAKE
NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)

ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif

# This operation is expensive, so execution should be once. # This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
@@ -148,7 +165,12 @@ EXTRALIB += -defaultlib:advapi32


SUFFIX = obj SUFFIX = obj
PSUFFIX = pobj PSUFFIX = pobj
LIBSUFFIX = lib
LIBSUFFIX = a

ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -DMS_ABI
endif

ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI #Test for supporting MS_ABI
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
@@ -167,8 +189,15 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
CCOMMON_OPT += -DMS_ABI CCOMMON_OPT += -DMS_ABI
endif endif
endif endif
endif


# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
ifeq ($(ARCH), x86)
CCOMMON_OPT += -mincoming-stack-boundary=2
FCOMMON_OPT += -mincoming-stack-boundary=2
endif endif

endif endif


ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
@@ -223,11 +252,17 @@ NO_BINARY_MODE = 1
endif endif
ifndef NO_EXPRECISION ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1 EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double FCOMMON_OPT += -m128bit-long-double
endif endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif endif
endif endif
endif endif
@@ -235,11 +270,17 @@ endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
ifndef NO_EXPRECISION ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1 EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double FCOMMON_OPT += -m128bit-long-double
endif endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif endif
endif endif
endif endif
@@ -249,7 +290,13 @@ CCOMMON_OPT += -wd981
endif endif


ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
endif

ifeq ($(C_COMPILER), CLANG)
$(error OpenBLAS: Clang didn't support OpenMP yet.)
CCOMMON_OPT += -fopenmp CCOMMON_OPT += -fopenmp
endif endif


@@ -277,14 +324,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif endif
endif endif


ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif endif
endif endif


@@ -318,11 +365,18 @@ endif
# C Compiler dependent settings # C Compiler dependent settings
# #


ifeq ($(C_COMPILER), GCC)

# ifeq logical or. GCC or CLANG or LSB
# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
CCOMMON_OPT += -Wall CCOMMON_OPT += -Wall
COMMON_PROF += -fno-inline COMMON_PROF += -fno-inline
NO_UNINITIALIZED_WARN = -Wno-uninitialized NO_UNINITIALIZED_WARN = -Wno-uninitialized


ifeq ($(QUIET_MAKE), 1)
CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
endif

ifdef NO_BINARY_MODE ifdef NO_BINARY_MODE


ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
@@ -407,7 +461,12 @@ endif
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
ifneq ($(C_COMPILER), LSB)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
endif
endif
ifdef NO_BINARY_MODE ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
ifdef BINARY64 ifdef BINARY64
@@ -514,11 +573,28 @@ ifdef INTERFACE64
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif endif

ifeq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3 -static
endif

ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3 -static
endif

else
ifndef BINARY64 ifndef BINARY64
FCOMMON_OPT += -m32 FCOMMON_OPT += -m32
else else
FCOMMON_OPT += -m64 FCOMMON_OPT += -m64
endif endif
endif


ifdef USE_OPENMP ifdef USE_OPENMP
FEXTRALIB += -lstdc++ FEXTRALIB += -lstdc++
@@ -527,12 +603,30 @@ endif
endif endif


ifeq ($(C_COMPILER), OPEN64) ifeq ($(C_COMPILER), OPEN64)

ifeq ($(ARCH), mips64)
ifndef BINARY64
CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3 -static
endif

ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3 -static
endif

else

ifndef BINARY64 ifndef BINARY64
CCOMMON_OPT += -m32 CCOMMON_OPT += -m32
else else
CCOMMON_OPT += -m64 CCOMMON_OPT += -m64
endif endif
endif endif
endif


ifeq ($(C_COMPILER), SUN) ifeq ($(C_COMPILER), SUN)
CCOMMON_OPT += -w CCOMMON_OPT += -w
@@ -741,6 +835,15 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES = #MAKEOVERRIDES =


LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
ifeq ($(C_COMPILER), LSB)
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
endif

ifndef SUFFIX ifndef SUFFIX
SUFFIX = o SUFFIX = o
endif endif
@@ -835,6 +938,13 @@ export ZGEMM_UNROLL_M
export ZGEMM_UNROLL_N export ZGEMM_UNROLL_N
export XGEMM_UNROLL_M export XGEMM_UNROLL_M
export XGEMM_UNROLL_N export XGEMM_UNROLL_N
export CGEMM3M_UNROLL_M
export CGEMM3M_UNROLL_N
export ZGEMM3M_UNROLL_M
export ZGEMM3M_UNROLL_N
export XGEMM3M_UNROLL_M
export XGEMM3M_UNROLL_N



ifdef USE_CUDA ifdef USE_CUDA
export CUDADIR export CUDADIR


+ 0
- 3
Makefile.x86 View File

@@ -1,8 +1,5 @@
# COMPILER_PREFIX = mingw32- # COMPILER_PREFIX = mingw32-


ifeq ($(OSNAME), Linux)
LDFLAGS = -melf_i386
endif


ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
ARFLAGS = -m x86 ARFLAGS = -m x86


+ 0
- 13
Makefile.x86_64 View File

@@ -2,25 +2,12 @@


ifeq ($(OSNAME), SunOS) ifeq ($(OSNAME), SunOS)
ifdef BINARY64 ifdef BINARY64
LDFLAGS = -64
ifeq ($(F_COMPILER), SUN) ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -m64 FCOMMON_OPT += -m64
endif endif
endif endif
endif endif


ifeq ($(OSNAME), FreeBSD)
LDFLAGS = -m elf_x86_64_fbsd
endif

ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf_x86_64
endif

ifeq ($(OSNAME), Darwin)
LDFLAGS =
endif

ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64 ARFLAGS = -m x64
endif endif


+ 29
- 14
README.md View File

@@ -1,11 +1,20 @@
# OpenBLAS # OpenBLAS


[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)

## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.


Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.


## Installation
## Binary Packages
We provide binary packages for the following platform.

* Windows x86/x86_64

You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).

## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/ Download from project homepage. http://xianyi.github.com/OpenBLAS/


Or, check out codes from git://github.com/xianyi/OpenBLAS.git Or, check out codes from git://github.com/xianyi/OpenBLAS.git
@@ -23,11 +32,15 @@ On X86 box, compile this library for loongson3a CPU.


make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A


On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.

make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32

### Debug version ### Debug version


make DEBUG=1 make DEBUG=1


### Intall to the directory (Optional)
### Install to the directory (optional)


Example: Example:


@@ -43,8 +56,10 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64: #### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Used Bulldozer codes.


#### MIPS64: #### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
@@ -54,7 +69,7 @@ Please read GotoBLAS_01Readme.txt
- **GNU/Linux** - **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supportted by community. We didn't test the library on this OS.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.


## Usages ## Usages
Link with libopenblas.a or -lopenblas for shared library. Link with libopenblas.a or -lopenblas for shared library.
@@ -79,7 +94,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro


### Set the number of threads on runtime. ### Set the number of threads on runtime.


We provided the below functions to controll the number of threads on runtime.
We provided the below functions to control the number of threads on runtime.


void goto_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads);


@@ -91,7 +106,8 @@ If you compile this lib with USE_OPENMP=1, you should use the above functions, t
Please add a issue in https://github.com/xianyi/OpenBLAS/issues Please add a issue in https://github.com/xianyi/OpenBLAS/issues


## Contact ## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev


## ChangeLog ## ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
@@ -104,10 +120,9 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.


## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.


+ 1
- 1
TargetList.txt View File

@@ -8,8 +8,8 @@ Supported List:
1.X86/X86_64 1.X86/X86_64
a)Intel CPU: a)Intel CPU:
P2 P2
COPPERMINE
KATMAI KATMAI
COPPERMINE
NORTHWOOD NORTHWOOD
PRESCOTT PRESCOTT
BANIAS BANIAS


+ 12
- 6
c_check View File

@@ -33,6 +33,8 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
} }


$compiler = ""; $compiler = "";
$compiler = LSB if ($data =~ /COMPILER_LSB/);
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
$compiler = PGI if ($data =~ /COMPILER_PGI/); $compiler = PGI if ($data =~ /COMPILER_PGI/);
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); $compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
$compiler = INTEL if ($data =~ /COMPILER_INTEL/); $compiler = INTEL if ($data =~ /COMPILER_INTEL/);
@@ -117,7 +119,11 @@ if ($compiler eq "OPEN64") {
$openmp = "-mp"; $openmp = "-mp";
} }


if ($compiler eq "GCC") {
if ($compiler eq "CLANG") {
$openmp = "-fopenmp";
}

if ($compiler eq "GCC" || $compiler eq "LSB") {
$openmp = "-fopenmp"; $openmp = "-fopenmp";
} }


@@ -241,13 +247,13 @@ print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";


if ($os eq "LINUX") { if ($os eq "LINUX") {
@pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
if ($pthread[2] ne "") {
print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
} else {
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}
# }
} else { } else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
} }


+ 10
- 0
cblas.h View File

@@ -16,6 +16,16 @@ void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/ /*Get the build configure on runtime.*/
char* openblas_get_config(void); char* openblas_get_config(void);


/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2


#define CBLAS_INDEX size_t #define CBLAS_INDEX size_t


typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;


+ 17
- 0
common.h View File

@@ -314,6 +314,23 @@ typedef int blasint;
#define YIELDING sched_yield() #define YIELDING sched_yield()
#endif #endif


/***
To alloc job_t on heap or stack.
Please see https://github.com/xianyi/OpenBLAS/issues/246
***/
#if defined(OS_WINDOWS)
#define GETRF_MEM_ALLOC_THRESHOLD 32
#define BLAS3_MEM_ALLOC_THRESHOLD 32
#endif

#ifndef GETRF_MEM_ALLOC_THRESHOLD
#define GETRF_MEM_ALLOC_THRESHOLD 80
#endif

#ifndef BLAS3_MEM_ALLOC_THRESHOLD
#define BLAS3_MEM_ALLOC_THRESHOLD 160
#endif

#ifdef QUAD_PRECISION #ifdef QUAD_PRECISION
#include "common_quad.h" #include "common_quad.h"
#endif #endif


+ 9
- 1
common_alpha.h View File

@@ -150,9 +150,17 @@ REALNAME:
#define PROFCODE .prologue 0 #define PROFCODE .prologue 0
#endif #endif


#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \ #define EPILOGUE \
.end REALNAME; \ .end REALNAME; \
.ident VERSION
.ident VERSION; \
GNUSTACK

#endif #endif


#ifdef DOUBLE #ifdef DOUBLE


+ 8
- 1
common_ia64.h View File

@@ -379,8 +379,15 @@ REALNAME:
#define PROFCODE #define PROFCODE
#endif #endif


#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \ #define EPILOGUE \
.endp REALNAME
.endp REALNAME ; \
GNUSTACK


#define START_ADDRESS 0x20000fc800000000UL #define START_ADDRESS 0x20000fc800000000UL




+ 14
- 1
common_linux.h View File

@@ -65,9 +65,16 @@ extern long int syscall (long int __sysno, ...);
#endif #endif
#endif #endif




static inline int my_mbind(void *addr, unsigned long len, int mode, static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode, unsigned long *nodemask, unsigned long maxnode,
unsigned flags) { unsigned flags) {
#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) doesn't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
#if defined (LOONGSON3B) #if defined (LOONGSON3B)
#if defined (__64BIT__) #if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
@@ -79,11 +86,17 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
// unsigned long null_nodemask=0; // unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif #endif
#endif
} }


static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) doesn't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
return syscall(SYS_set_mempolicy, mode, addr, flag); return syscall(SYS_set_mempolicy, mode, addr, flag);
#endif
} }


static inline int my_gettid(void) { static inline int my_gettid(void) {


+ 10
- 3
common_mips64.h View File

@@ -235,10 +235,17 @@ REALNAME: ;\
.set noreorder ;\ .set noreorder ;\
.set nomacro .set nomacro


#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \ #define EPILOGUE \
.set macro ;\ .set macro ;\
.set reorder ;\ .set reorder ;\
.end REALNAME
.end REALNAME ;\
GNUSTACK


#define PROFCODE #define PROFCODE
#endif #endif
@@ -255,8 +262,8 @@ REALNAME: ;\
#endif #endif


#if defined(LOONGSON3B) #if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif #endif


#ifndef PAGESIZE #ifndef PAGESIZE


+ 10
- 1
common_sparc.h View File

@@ -199,8 +199,17 @@ static __inline int blas_quickdivide(blasint x, blasint y){
.type REALNAME, #function; \ .type REALNAME, #function; \
.proc 07; \ .proc 07; \
REALNAME:; REALNAME:;

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \ #define EPILOGUE \
.size REALNAME, .-REALNAME
.size REALNAME, .-REALNAME; \
GNUSTACK

#endif #endif


#endif #endif


+ 10
- 3
common_x86.h View File

@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd #define MMXSTORE movd
#endif #endif


#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

#if defined(HAVE_3DNOW) #if defined(HAVE_3DNOW)
#define EMMS femms #define EMMS femms
#elif defined(HAVE_MMX) #elif defined(HAVE_MMX)
@@ -296,7 +301,9 @@ REALNAME:
#define PROFCODE #define PROFCODE
#endif #endif


#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits


#endif #endif


@@ -335,6 +342,7 @@ REALNAME:
#define ALIGN_2 .align 2 #define ALIGN_2 .align 2
#define ALIGN_3 .align 3 #define ALIGN_3 .align 3
#define ALIGN_4 .align 4 #define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp #define ffreep fstp
#endif #endif


@@ -356,11 +364,10 @@ REALNAME:


#ifndef ALIGN_6 #ifndef ALIGN_6
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
#endif
// ffreep %st(0). // ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode. // Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm // Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep #ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 # #define ffreep .byte 0xdf, 0xc0 #
#endif #endif
#endif

+ 9
- 1
common_x86_64.h View File

@@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){


#ifdef ASSEMBLER #ifdef ASSEMBLER


#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

#if defined(HAVE_3DNOW) #if defined(HAVE_3DNOW)
#define EMMS femms #define EMMS femms
#elif defined(HAVE_MMX) #elif defined(HAVE_MMX)
@@ -367,7 +372,10 @@ REALNAME:
#define PROFCODE #define PROFCODE
#endif #endif


#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits



#endif #endif




+ 7
- 0
cpuid.h View File

@@ -106,6 +106,8 @@
#define CORE_SANDYBRIDGE 20 #define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21 #define CORE_BOBCAT 21
#define CORE_BULLDOZER 22 #define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE


#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@@ -127,6 +129,7 @@
#define HAVE_FASTMOVU (1 << 17) #define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18) #define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19) #define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)


#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@@ -196,4 +199,8 @@ typedef struct {
#define CPUTYPE_SANDYBRIDGE 44 #define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45 #define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46 #define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE

#endif #endif

+ 1
- 0
cpuid_power.c View File

@@ -114,6 +114,7 @@ int detect(void){
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;




+ 68
- 11
cpuid_x86.c View File

@@ -41,10 +41,14 @@
#include "cpuid.h" #include "cpuid.h"


#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA #define CORE_BULLDOZER CORE_BARCELONA
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif #endif


#ifndef CPUIDEMU #ifndef CPUIDEMU
@@ -130,7 +134,7 @@ int support_avx(){
int ret=0; int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx); cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx); xgetbv(0, &eax, &edx);
if((eax & 6) == 6){ if((eax & 6) == 6){
ret=1; //OS support AVX ret=1; //OS support AVX
@@ -225,6 +229,7 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX #ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX; if (support_avx()) feature |= HAVE_AVX;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif #endif


if (have_excpuid() >= 0x01) { if (have_excpuid() >= 0x01) {
@@ -1050,8 +1055,22 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
} }
break; break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
}
break;
} }
break; break;
case 0x7: case 0x7:
@@ -1084,11 +1103,21 @@ int get_cpuname(void){
case 1: case 1:
case 10: case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 6:
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
case 5: case 5:
return CPUTYPE_BOBCAT; return CPUTYPE_BOBCAT;
} }
@@ -1213,6 +1242,7 @@ static char *cpuname[] = {
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCAT", "BOBCAT",
"BULLDOZER", "BULLDOZER",
"PILEDRIVER",
}; };


static char *lowercpuname[] = { static char *lowercpuname[] = {
@@ -1262,6 +1292,7 @@ static char *lowercpuname[] = {
"sandybridge", "sandybridge",
"bobcat", "bobcat",
"bulldozer", "bulldozer",
"piledriver",
}; };


static char *corename[] = { static char *corename[] = {
@@ -1288,6 +1319,7 @@ static char *corename[] = {
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCAT", "BOBCAT",
"BULLDOZER", "BULLDOZER",
"PILEDRIVER",
}; };


static char *corename_lower[] = { static char *corename_lower[] = {
@@ -1314,6 +1346,7 @@ static char *corename_lower[] = {
"sandybridge", "sandybridge",
"bobcat", "bobcat",
"bulldozer", "bulldozer",
"piledriver",
}; };




@@ -1424,8 +1457,22 @@ int get_coretype(void){
return CORE_SANDYBRIDGE; return CORE_SANDYBRIDGE;
else else
return CORE_NEHALEM; //OS doesn't support AVX return CORE_NEHALEM; //OS doesn't support AVX
case 12:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
} }
break; break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
}
break;
} }
break; break;


@@ -1442,11 +1489,19 @@ int get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT; else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) { else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
}
}else return CORE_BARCELONA; }else return CORE_BARCELONA;
} }
} }
@@ -1534,6 +1589,7 @@ void get_cpuconfig(void){
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@@ -1601,5 +1657,6 @@ void get_sse(void){
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");


} }

+ 14
- 0
ctest.c View File

@@ -1,3 +1,17 @@
//LSB (Linux Standard Base) compiler
//only lsbcc (the C compiler) is supported, not lsbc++
#if defined (__LSB_VERSION__)
#if !defined (__cplusplus)
COMPILER_LSB
#else
#error "OpenBLAS only supports lsbcc."
#endif
#endif

#if defined(__clang__)
COMPILER_CLANG
#endif

#if defined(__PGI) || defined(__PGIC__) #if defined(__PGI) || defined(__PGIC__)
COMPILER_PGI COMPILER_PGI
#endif #endif


+ 1
- 1
ctest/Makefile View File

@@ -77,7 +77,7 @@ endif
clean :: clean ::
rm -f x* rm -f x*


FLDFLAGS = $(FFLAGS:-fPIC=)
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB = CEXTRALIB =


# Single real # Single real


+ 6
- 4
driver/level2/sbmv_thread.c View File

@@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F


a = (FLOAT *)args -> a; a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b; x = (FLOAT *)args -> b;
y = (FLOAT *)args -> c;


lda = args -> lda; lda = args -> lda;
incx = args -> ldb; incx = args -> ldb;
@@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_from = 0; n_from = 0;
n_to = n; n_to = n;


//Use y as each thread's n* COMPSIZE elements in sb buffer
y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);

if (range_m) { if (range_m) {
n_from = *(range_m + 0); n_from = *(range_m + 0);
n_to = *(range_m + 1); n_to = *(range_m + 1);
@@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE; a += n_from * lda * COMPSIZE;
} }


if (range_n) y += *range_n * COMPSIZE;


if (incx != 1) { if (incx != 1) {
COPY_K(n, x, incx, buffer, 1); COPY_K(n, x, incx, buffer, 1);
@@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x


if (num_cpu) { if (num_cpu) {
queue[0].sa = NULL; queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
@@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#else #else
ONE, ZERO, ONE, ZERO,
#endif #endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
} }


AXPYU_K(n, 0, 0, AXPYU_K(n, 0, 0,


+ 2
- 2
driver/level3/gemm_thread_n.c View File

@@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg; queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m; queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].range_n = &range[num_cpu];
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else #else
@@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
} }
if (num_cpu) { if (num_cpu) {
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[0].sa = sa; queue[0].sa = sa;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5; queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else #else


+ 14
- 1
driver/level3/level3.c View File

@@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else #else
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC(); START_RPCC();


+ 24
- 1
driver/level3/level3_gemm3m_thread.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2 #define SWITCH_RATIO 2
#endif #endif


//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM3M_LOCAL #ifndef GEMM3M_LOCAL
#if defined(NN) #if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN #define GEMM3M_LOCAL GEMM3M_NN
@@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1];


job_t job[MAX_CPU_NUMBER];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif


BLASLONG num_cpu_m, num_cpu_n; BLASLONG num_cpu_m, num_cpu_n;


@@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha; newarg.alpha = args -> alpha;
newarg.beta = args -> beta; newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads; newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job; newarg.common = (void *)job;
if (!range_m) { if (!range_m) {
@@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue); exec_blas(num_cpu_m, queue);
} }


#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0; return 0;
} }




+ 23
- 0
driver/level3/level3_syrk_threaded.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2 #define SWITCH_RATIO 2
#endif #endif


//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef SYRK_LOCAL #ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS) #if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN #define SYRK_LOCAL SYRK_UN
@@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO


blas_arg_t newarg; blas_arg_t newarg;


#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER]; job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];


BLASLONG range[MAX_CPU_NUMBER + 100]; BLASLONG range[MAX_CPU_NUMBER + 100];
@@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
newarg.ldc = args -> ldc; newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha; newarg.alpha = args -> alpha;
newarg.beta = args -> beta; newarg.beta = args -> beta;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job; newarg.common = (void *)job;
if (!range_n) { if (!range_n) {
@@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
} }
#ifdef USE_ALLOC_HEAP
free(job);
#endif


return 0; return 0;
} }

+ 38
- 2
driver/level3/level3_thread.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2 #define SWITCH_RATIO 2
#endif #endif


//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL #ifndef GEMM_LOCAL
#if defined(NN) #if defined(NN)
#define GEMM_LOCAL GEMM_NN #define GEMM_LOCAL GEMM_NN
@@ -360,8 +366,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,


for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC(); START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
@@ -519,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG


blas_arg_t newarg; blas_arg_t newarg;


#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER]; job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];


BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_M[MAX_CPU_NUMBER + 1];
@@ -563,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha; newarg.alpha = args -> alpha;
newarg.beta = args -> beta; newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads; newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job; newarg.common = (void *)job;
#ifdef PARAMTEST #ifdef PARAMTEST
@@ -634,7 +666,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_n ++; num_cpu_n ++;
} }
for (j = 0; j < num_cpu_m; j++) { for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) { for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) { for (k = 0; k < DIVIDE_RATE; k++) {
@@ -648,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue); exec_blas(num_cpu_m, queue);
} }


#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0; return 0;
} }




+ 4
- 1
driver/others/Makefile View File

@@ -1,7 +1,7 @@
TOPDIR = ../.. TOPDIR = ../..
include ../../Makefile.system include ../../Makefile.system


COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX)


COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)


@@ -106,6 +106,9 @@ openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
openblas_get_config.$(SUFFIX) : openblas_get_config.c openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)


# Compile openblas_get_parallel.c (the first prerequisite, $<) into its object
# file. The output is named with $(@F), i.e. the file part of the target, so
# the object is written relative to the directory the recipe runs in.
openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
$(CC) $(CFLAGS) -c $< -o $(@F)

blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)




+ 1
- 0
driver/others/blas_server.c View File

@@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR


+ 50
- 6
driver/others/blas_server_omp.c View File

@@ -49,8 +49,12 @@


int blas_server_avail = 0; int blas_server_avail = 0;


static void * blas_thread_buffer[MAX_CPU_NUMBER];

void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {


int i=0;

if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;


if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads; blas_cpu_number = num_threads;


omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);

//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
//set parameters for different number of threads. //set parameters for different number of threads.
blas_set_parameter(); blas_set_parameter();
@@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {


int blas_thread_init(void){ int blas_thread_init(void){


int i=0;

blas_get_cpu_number(); blas_get_cpu_number();


blas_server_avail = 1; blas_server_avail = 1;


for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
}

return 0; return 0;
} }


int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){

int i=0;
blas_server_avail = 0; blas_server_avail = 0;


for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}

return 0; return 0;
} }


@@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue){


void *buffer, *sa, *sb; void *buffer, *sa, *sb;

int pos=0, release_flag=0;
buffer = NULL; buffer = NULL;
sa = queue -> sa; sa = queue -> sa;
sb = queue -> sb; sb = queue -> sb;
@@ -189,9 +222,19 @@ static void exec_threads(blas_queue_t *queue){


if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {


buffer = blas_memory_alloc(2);
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];


if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
//fallback
if(buffer==NULL) {
buffer = blas_memory_alloc(2);
release_flag=1;
}

if (sa == NULL) {
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
queue->sa=sa;
}
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){ if (!(queue -> mode & BLAS_COMPLEX)){
@@ -224,6 +267,7 @@ static void exec_threads(blas_queue_t *queue){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
} }


@@ -241,7 +285,7 @@ static void exec_threads(blas_queue_t *queue){


} }


if (buffer != NULL) blas_memory_free(buffer);
if (release_flag) blas_memory_free(buffer);


} }




+ 2
- 1
driver/others/blas_server_win32.c View File

@@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR
@@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads)
void openblas_set_num_threads(int num) void openblas_set_num_threads(int num)
{ {
goto_set_num_threads(num); goto_set_num_threads(num);
}
}

+ 42
- 7
driver/others/dynamic.c View File

@@ -64,12 +64,15 @@ extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX #ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
#else #else
//Use NEHALEM kernels for sandy bridge //Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif #endif

//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE


#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
@@ -92,7 +95,7 @@ int support_avx(){
int ret=0; int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx); cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx); xgetbv(0, &eax, &edx);
if((eax & 6) == 6){ if((eax & 6) == 6){
ret=1; //OS support AVX ret=1; //OS support AVX
@@ -175,7 +178,7 @@ static gotoblas_t *get_coretype(void){
if(support_avx()) if(support_avx())
return &gotoblas_SANDYBRIDGE; return &gotoblas_SANDYBRIDGE;
else{ else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
@@ -186,7 +189,27 @@ static gotoblas_t *get_coretype(void){
if(support_avx()) if(support_avx())
return &gotoblas_SANDYBRIDGE; return &gotoblas_SANDYBRIDGE;
else{ else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Haswell
if (model == 12) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 4:
//Intel Haswell
if (model == 5) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
@@ -207,13 +230,23 @@ static gotoblas_t *get_coretype(void){
} else if (exfamily == 5) { } else if (exfamily == 5) {
return &gotoblas_BOBCAT; return &gotoblas_BOBCAT;
} else if (exfamily == 6) { } else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(model == 1){
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx()) if(support_avx())
return &gotoblas_BULLDOZER; return &gotoblas_BULLDOZER;
else{ else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}else if(model == 2){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else { } else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
@@ -251,6 +284,7 @@ static char *corename[] = {
"Sandybridge", "Sandybridge",
"Bobcat", "Bobcat",
"Bulldozer", "Bulldozer",
"Piledriver",
}; };


char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@@ -273,6 +307,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];


return corename[0]; return corename[0];
} }


+ 5
- 2
driver/others/init.c View File

@@ -82,6 +82,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sched.h> #include <sched.h>
#include <dirent.h> #include <dirent.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <unistd.h>
#include <string.h>


#define MAX_NODES 16 #define MAX_NODES 16
#define MAX_CPUS 256 #define MAX_CPUS 256
@@ -314,7 +316,7 @@ static int numa_check(void) {
} }


while ((dir = readdir(dp)) != NULL) { while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
if (strncmp(dir->d_name, "node", 4)==0) {


node = atoi(&dir -> d_name[4]); node = atoi(&dir -> d_name[4]);


@@ -735,7 +737,8 @@ void gotoblas_affinity_init(void) {
fprintf(stderr, "Shared Memory Initialization.\n"); fprintf(stderr, "Shared Memory Initialization.\n");
#endif #endif


common -> num_procs = get_nprocs();
//returns the number of processors which are currently online
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;


if(common -> num_procs > MAX_CPUS) { if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);


+ 22
- 1
driver/others/memory.c View File

@@ -105,6 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#if defined(OS_FREEBSD) || defined(OS_DARWIN) #if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/resource.h>
#endif #endif


#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
@@ -125,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NO_WARMUP #define NO_WARMUP
#endif #endif


#ifdef ALLOC_HUGETLB
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000 #define SHM_HUGETLB 04000
#endif #endif


@@ -216,6 +217,25 @@ int get_num_procs(void) {
} }
return nums; return nums;
} }
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;

StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif #endif


/* /*
@@ -1248,6 +1268,7 @@ void CONSTRUCTOR gotoblas_init(void) {


if (gotoblas_initialized) return; if (gotoblas_initialized) return;



#ifdef PROFILE #ifdef PROFILE
moncontrol (0); moncontrol (0);
#endif #endif


+ 52
- 0
driver/others/openblas_get_parallel.c View File

@@ -0,0 +1,52 @@
/*****************************************************************************
Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "common.h"

/*
 * Threading-model code selected at compile time:
 *   2 - USE_OPENMP is defined
 *   1 - SMP_SERVER is defined (and USE_OPENMP is not)
 *   0 - neither macro is defined
 * The value is never modified after initialization.
 */
#if defined(USE_OPENMP)
static int parallel = 2;
#elif defined(SMP_SERVER)
static int parallel = 1;
#else
static int parallel = 0;
#endif

/*
 * Returns the compile-time threading-model code described above.
 *
 * Declared with an explicit (void) parameter list so that the definition is
 * a real prototype; an empty "()" is an old-style definition that lets
 * callers pass arbitrary arguments without a diagnostic.
 *
 * NOTE(review): CNAME is not defined in this file; presumably the build
 * system expands it to the exported C symbol name -- confirm against the
 * build flags.
 */
int CNAME(void) {
  return parallel;
}

/*
 * Second entry point returning the same threading-model code as CNAME.
 *
 * NOTE(review): NAME is not defined in this file; presumably it expands to
 * the Fortran-style (underscore-suffixed) symbol name -- confirm against the
 * build flags.
 */
int NAME(void) {
  return parallel;
}




+ 14
- 9
exports/Makefile View File

@@ -89,7 +89,7 @@ else
endif endif


libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
$(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \
$(CC) $(CFLAGS) $(LDFLAGS) libgoto2_shared.def -shared -o $(@F) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)


@@ -116,10 +116,15 @@ ifeq ($(OSNAME), Linux)
so : ../$(LIBSONAME) so : ../$(LIBSONAME)


../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
#Use FC on LSB
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
rm -f linktest rm -f linktest


endif endif
@@ -130,10 +135,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
so : ../$(LIBSONAME) so : ../$(LIBSONAME)


../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest rm -f linktest


endif endif
@@ -143,15 +148,15 @@ ifeq ($(OSNAME), OSF1)
so : ../$(LIBSONAME) so : ../$(LIBSONAME)


../$(LIBSONAME) : ../$(LIBSONAME) :
$(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
endif endif


ifeq ($(OSNAME), SunOS) ifeq ($(OSNAME), SunOS)


so : ../$(LIBSONAME) so : ../$(LIBSONAME)
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest rm -f linktest


endif endif
@@ -194,7 +199,7 @@ symbol.S : gensymbol
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S


test : linktest.c test : linktest.c
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest rm -f linktest


linktest.c : gensymbol ../Makefile.system ../getarch.c linktest.c : gensymbol ../Makefile.system ../getarch.c


+ 56
- 21
exports/gensymbol View File

@@ -49,7 +49,7 @@
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv);
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub );


@exblasobjs = ( @exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -72,13 +72,18 @@
zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
); );



#both underscore and no underscore
@misc_common_objs = (
openblas_set_num_threads, openblas_get_parallel,
);

@misc_no_underscore_objs = ( @misc_no_underscore_objs = (
openblas_set_num_threads, goto_set_num_threads,
goto_set_num_threads,
openblas_get_config, openblas_get_config,
); );


@misc_underscore_objs = ( @misc_underscore_objs = (
openblas_set_num_threads,
); );


@lapackobjs = ( @lapackobjs = (
@@ -111,7 +116,7 @@
# already provided by @blasobjs: xerbla, lsame # already provided by @blasobjs: xerbla, lsame
ilaenv, ieeeck, lsamen, xerbla_array, iparmq, ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
ilaver, slamch,
ilaver, slamch, slamc3,


# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
# excluded: second_$(TIMER) # excluded: second_$(TIMER)
@@ -148,7 +153,7 @@
dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
dsteqr, dsterf, dlaisnan, disnan, dsteqr, dsterf, dlaisnan, disnan,
dlartgp, dlartgs, dlartgp, dlartgs,
dlamch,
dlamch, dlamc3,


# SLASRC -- Single precision real LAPACK routines # SLASRC -- Single precision real LAPACK routines
# already provided by @lapackobjs: # already provided by @lapackobjs:
@@ -2671,7 +2676,7 @@ if ($ARGV[5] == 1) {
#NO_LAPACK=1 #NO_LAPACK=1
@underscore_objs = (@blasobjs, @misc_underscore_objs); @underscore_objs = (@blasobjs, @misc_underscore_objs);
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" || } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" ||
-d "../lapack-3.4.2") {
-d "../lapack-3.4.2" || -d "../lapack-netlib") {
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
} else { } else {
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
@@ -2679,7 +2684,7 @@ if ($ARGV[5] == 1) {


if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };


if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
if ($ARGV[1] eq "x86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };


if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };


@@ -2716,6 +2721,10 @@ $bu = $ARGV[2];
$bu = "" if (($bu eq "0") || ($bu eq "1")); $bu = "" if (($bu eq "0") || ($bu eq "1"));


if ($ARGV[0] eq "linux"){ if ($ARGV[0] eq "linux"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
print $objs, $bu, "\n"; print $objs, $bu, "\n";
} }
@@ -2733,6 +2742,10 @@ if ($ARGV[0] eq "linux"){
} }


if ($ARGV[0] eq "osx"){ if ($ARGV[0] eq "osx"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
print "_", $objs, $bu, "\n"; print "_", $objs, $bu, "\n";
} }
@@ -2746,6 +2759,10 @@ if ($ARGV[0] eq "osx"){
} }


if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "aix"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
print $objs, $bu, "\n"; print $objs, $bu, "\n";
} }
@@ -2761,23 +2778,31 @@ if ($ARGV[0] eq "aix"){
if ($ARGV[0] eq "win2k"){ if ($ARGV[0] eq "win2k"){
print "EXPORTS\n"; print "EXPORTS\n";
$count = 1; $count = 1;

@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
#for misc_common_objs
foreach $objs (@misc_common_objs) {

$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
} }
#for openblas_set_num_threads
print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n";
$count ++;
foreach $objs (@no_underscore_objs) { foreach $objs (@no_underscore_objs) {
print "\t",$objs,"=$objs"," \@", $count, "\n"; print "\t",$objs,"=$objs"," \@", $count, "\n";
@@ -2810,6 +2835,9 @@ if ($ARGV[0] eq "win2khpl"){
} }


if ($ARGV[0] eq "microsoft"){ if ($ARGV[0] eq "microsoft"){

@underscore_objs = (@underscore_objs, @misc_common_objs);

print "EXPORTS\n"; print "EXPORTS\n";
$count = 1; $count = 1;
foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
@@ -2828,6 +2856,9 @@ if ($ARGV[0] eq "microsoft"){
} }


if ($ARGV[0] eq "win2kasm"){ if ($ARGV[0] eq "win2kasm"){

@underscore_objs = (@underscore_objs, @misc_common_objs);

print "\t.text\n"; print "\t.text\n";
foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
$uppercase = $objs; $uppercase = $objs;
@@ -2841,6 +2872,10 @@ if ($ARGV[0] eq "win2kasm"){
} }


if ($ARGV[0] eq "linktest"){ if ($ARGV[0] eq "linktest"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

print "int main(void){\n"; print "int main(void){\n";
foreach $objs (@underscore_objs) { foreach $objs (@underscore_objs) {
print $objs, $bu, "();\n" if $objs ne "xerbla"; print $objs, $bu, "();\n" if $objs ne "xerbla";


+ 1
- 1
f_check View File

@@ -24,7 +24,7 @@ $compiler = "" if $compiler eq "f77";


if ($compiler eq "") { if ($compiler eq "") {


@lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95", "sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf", "xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf",


+ 38
- 13
getarch.c View File

@@ -83,6 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef linux #ifdef linux
#include <sys/sysinfo.h> #include <sys/sysinfo.h>
#include <unistd.h>
#endif #endif


/* #define FORCE_P2 */ /* #define FORCE_P2 */
@@ -96,14 +97,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PENRYN */ /* #define FORCE_PENRYN */
/* #define FORCE_DUNNINGTON */ /* #define FORCE_DUNNINGTON */
/* #define FORCE_NEHALEM */ /* #define FORCE_NEHALEM */
/* #define FORCE_SANDYBRIDGE */
/* #define FORCE_ATOM */
/* #define FORCE_ATHLON */ /* #define FORCE_ATHLON */
/* #define FORCE_OPTERON */ /* #define FORCE_OPTERON */
/* #define FORCE_OPTERON_SSE3 */ /* #define FORCE_OPTERON_SSE3 */
/* #define FORCE_BARCELONA */ /* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */ /* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */ /* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_BULLDOZER */ /* #define FORCE_BULLDOZER */
/* #define FORCE_BOBCAT */
/* #define FORCE_PILEDRIVER */
/* #define FORCE_SSE_GENERIC */ /* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */ /* #define FORCE_VIAC3 */
/* #define FORCE_NANO */ /* #define FORCE_NANO */
@@ -118,12 +122,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440FP2 */ /* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */ /* #define FORCE_CELL */
/* #define FORCE_SICORTEX */ /* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */ /* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */ /* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */ /* #define FORCE_SPARCV7 */
/* #define FORCE_GENERIC */


#ifdef FORCE_P2 #ifdef FORCE_P2
#define FORCE #define FORCE
@@ -139,32 +143,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "P5" #define CORENAME "P5"
#endif #endif


#ifdef FORCE_COPPERMINE
#ifdef FORCE_KATMAI
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3" #define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \ #define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#define LIBNAME "katmai"
#define CORENAME "KATMAI"
#endif #endif


#ifdef FORCE_KATMAI
#ifdef FORCE_COPPERMINE
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3" #define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \ #define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "katmai"
#define CORENAME "KATMAI"
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif #endif


#ifdef FORCE_NORTHWOOD #ifdef FORCE_NORTHWOOD
@@ -396,6 +400,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BULLDOZER" #define CORENAME "BULLDOZER"
#endif #endif


#if defined (FORCE_PILEDRIVER)
/*
 * Forced configuration for the PILEDRIVER target.
 * Sets the architecture/sub-architecture names, the library and core names,
 * and the -D option string (ARCHCONFIG) used for this core:
 *   L1 data cache : 16384 bytes (16 KiB), 64-byte lines
 *   L2 cache      : 2097152 bytes (2 MiB), 64-byte lines
 *   L3 cache      : 12582912 bytes (12 MiB)
 *   DTB           : 64 default entries, 4096-byte size
 * plus the HAVE_* feature flags listed below (MMX through SSE4.2, SSE4A,
 * AVX, FMA4 and FMA3).
 *
 * NOTE(review): FORCE_INTEL is defined here although the target is named
 * PILEDRIVER; presumably this follows the other FORCE_* x86 entries in this
 * file -- confirm.
 * NOTE(review): the last ARCHCONFIG fragment has no trailing space, unlike
 * e.g. the KATMAI/COPPERMINE strings -- confirm nothing is appended directly
 * after it.
 */
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PILEDRIVER"
#define ARCHCONFIG "-DPILEDRIVER " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME "piledriver"
#define CORENAME "PILEDRIVER"
#endif

#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@@ -717,7 +737,8 @@ static int get_num_cores(void) {
#endif #endif
#ifdef linux #ifdef linux
return get_nprocs();
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(OS_WINDOWS) #elif defined(OS_WINDOWS)


@@ -802,8 +823,12 @@ int main(int argc, char *argv[]){
#endif #endif
#endif #endif


#if NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS #ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores()); printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif #endif


break; break;


+ 40
- 2
getarch_2nd.c View File

@@ -8,7 +8,7 @@


int main(int argc, char **argv) { int main(int argc, char **argv) {


if ((argc < 1) || (*argv[1] == '0')) {
if ( (argc <= 1) || (argc >= 2) && (*argv[1] == '0')) {
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
@@ -22,10 +22,48 @@ int main(int argc, char **argv) {
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M);
printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N);

/*
 * Emit the unroll factors for the complex GEMM3M kernels (single and double
 * precision). For each of M and N: print the dedicated
 * {C,Z}GEMM3M_DEFAULT_UNROLL_{M,N} value when that macro is defined;
 * otherwise fall back to the corresponding real GEMM unroll factor
 * (SGEMM_* for CGEMM3M, DGEMM_* for ZGEMM3M).
 */
#ifdef CGEMM3M_DEFAULT_UNROLL_M
printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M);
#else
/* No dedicated value: reuse the single-precision real GEMM M unroll. */
printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef CGEMM3M_DEFAULT_UNROLL_N
printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N);
#else
/* No dedicated value: reuse the single-precision real GEMM N unroll. */
printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
#endif

#ifdef ZGEMM3M_DEFAULT_UNROLL_M
printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
#else
/* No dedicated value: reuse the double-precision real GEMM M unroll. */
printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef ZGEMM3M_DEFAULT_UNROLL_N
printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
#else
/* No dedicated value: reuse the double-precision real GEMM N unroll. */
printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
#endif

/*
 * Emit the unroll factors for the extended-precision complex GEMM3M kernels.
 * For each of M and N: print XGEMM3M_DEFAULT_UNROLL_{M,N} when that macro is
 * defined; otherwise fall back to the QGEMM unroll factor.
 *
 * The guarded branch must print the same macro that the #ifdef tests.
 * Printing the ZGEMM3M_* value there would report the wrong unroll factor,
 * and would fail to compile when only the XGEMM3M_* macro is defined.
 */
#ifdef XGEMM3M_DEFAULT_UNROLL_M
printf("XGEMM3M_UNROLL_M=%d\n", XGEMM3M_DEFAULT_UNROLL_M);
#else
/* No dedicated value: reuse the QGEMM M unroll. */
printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef XGEMM3M_DEFAULT_UNROLL_N
printf("XGEMM3M_UNROLL_N=%d\n", XGEMM3M_DEFAULT_UNROLL_N);
#else
/* No dedicated value: reuse the QGEMM N unroll. */
printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
#endif


} }


if ((argc >= 1) && (*argv[1] == '1')) {
if ((argc >= 2) && (*argv[1] == '1')) {
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));


+ 13
- 0
interface/trtri.c View File

@@ -60,6 +60,8 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
}; };
#endif #endif


extern void dtrtri_lapack_(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);

int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){


blas_arg_t args; blas_arg_t args;
@@ -83,6 +85,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
TOUPPER(uplo_arg); TOUPPER(uplo_arg);
TOUPPER(diag_arg); TOUPPER(diag_arg);



uplo = -1; uplo = -1;
if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'L') uplo = 1;
@@ -90,6 +93,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag_arg == 'U') diag = 0; if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1; if (diag_arg == 'N') diag = 1;



info = 0; info = 0;
if (args.lda < MAX(1,args.n)) info = 5; if (args.lda < MAX(1,args.n)) info = 5;
if (args.n < 0) info = 3; if (args.n < 0) info = 3;
@@ -129,6 +133,15 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif


#if DOUBLE
// Workaround: the double-precision, single-threaded TRTRI path for the upper
// triangle (uplo == 0) has an error, so dtrtri from LAPACK is called instead
// and the native kernel below is bypassed for that case.
// NOTE(review): dtrtri_lapack_ is declared with int* arguments while N, ldA
// and Info are blasint* here -- confirm the types agree when blasint is not int.
// NOTE(review): this returns before the function's normal exit path; verify
// that nothing acquired earlier in NAME still needs releasing -- TODO confirm.
if(uplo==0){
dtrtri_lapack_(UPLO, DIAG, N, a, ldA, Info);
return 0;
}
#endif

*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP #ifdef SMP


+ 1
- 1
kernel/Makefile.L2 View File

@@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@


$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@


$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@


+ 216
- 216
kernel/Makefile.L3 View File

@@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c $(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@


$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# xsymm3m_iucopyr: compiles generic/zsymm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xsymm3m_ilcopyr: compiles generic/zsymm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xsymm3m_iucopyi: compiles generic/zsymm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xsymm3m_ilcopyi: compiles generic/zsymm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# chemm3m_oucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# chemm3m_olcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# chemm3m_oucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# chemm3m_olcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# chemm3m_oucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# chemm3m_olcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_N, with COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# chemm3m_iucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# chemm3m_ilcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# chemm3m_iucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# chemm3m_ilcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# chemm3m_iucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# chemm3m_ilcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of CGEMM3M_UNROLL_M, with COMPLEX and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zhemm3m_oucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zhemm3m_olcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zhemm3m_oucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# zhemm3m_olcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# zhemm3m_oucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zhemm3m_olcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_N, with DOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zhemm3m_iucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE and COMPLEX defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# zhemm3m_ilcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE and COMPLEX defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# zhemm3m_iucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# zhemm3m_ilcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# zhemm3m_iucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zhemm3m_ilcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of ZGEMM3M_UNROLL_M, with DOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xhemm3m_oucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# xhemm3m_olcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# xhemm3m_oucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# xhemm3m_olcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# xhemm3m_oucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xhemm3m_olcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_N, with XDOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xhemm3m_iucopyb: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE and COMPLEX defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# xhemm3m_ilcopyb: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE and COMPLEX defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# xhemm3m_iucopyr: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xhemm3m_ilcopyr: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xhemm3m_iucopyi: compiles generic/zhemm3m_ucopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xhemm3m_ilcopyi: compiles generic/zhemm3m_lcopy_<n>.c, where <n> is the value
# of XGEMM3M_UNROLL_M, with XDOUBLE, COMPLEX and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
@@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
# xhemm_iltcopy ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zhemm_ltcopy_<n>.c, where <n> is the value of XGEMM_UNROLL_M, with
# XDOUBLE, COMPLEX and LOWER defined, OUTER undefined.
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@


# cgemm3m_oncopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# cgemm3m_oncopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# cgemm3m_oncopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# cgemm3m_otcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# cgemm3m_otcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# cgemm3m_otcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# cgemm3m_incopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and ICOPY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# cgemm3m_incopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX, ICOPY and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# cgemm3m_incopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX, ICOPY and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# cgemm3m_itcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and ICOPY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# cgemm3m_itcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX, ICOPY and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# cgemm3m_itcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX, ICOPY and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zgemm3m_oncopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zgemm3m_oncopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# zgemm3m_oncopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zgemm3m_otcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zgemm3m_otcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# zgemm3m_otcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zgemm3m_incopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX and ICOPY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# zgemm3m_incopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX, ICOPY and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# zgemm3m_incopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX, ICOPY and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zgemm3m_itcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX and ICOPY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# zgemm3m_itcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX, ICOPY and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# zgemm3m_itcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_M, with
# DOUBLE, COMPLEX, ICOPY and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xgemm3m_oncopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# xgemm3m_oncopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# xgemm3m_oncopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xgemm3m_otcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# xgemm3m_otcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# xgemm3m_otcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_N, with
# XDOUBLE, COMPLEX, USE_ALPHA and IMAGE_ONLY defined.
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xgemm3m_incopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX and ICOPY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# xgemm3m_incopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX, ICOPY and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xgemm3m_incopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_ncopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX, ICOPY and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# xgemm3m_itcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX and ICOPY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@


# xgemm3m_itcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX, ICOPY and REAL_ONLY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@


# xgemm3m_itcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zgemm3m_tcopy_<n>.c, where <n> is the value of XGEMM3M_UNROLL_M, with
# XDOUBLE, COMPLEX, ICOPY and IMAGE_ONLY defined, USE_ALPHA undefined.
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# csymm3m_oucopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# csymm3m_olcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX and USE_ALPHA defined, DOUBLE undefined.
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# csymm3m_oucopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# csymm3m_olcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and REAL_ONLY defined, DOUBLE undefined.
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


# csymm3m_oucopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# csymm3m_olcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_N, with
# COMPLEX, USE_ALPHA and IMAGE_ONLY defined, DOUBLE undefined.
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


# csymm3m_iucopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# csymm3m_ilcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


# csymm3m_iucopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# csymm3m_ilcopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and REAL_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


# csymm3m_iucopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# csymm3m_ilcopyi ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of CGEMM3M_UNROLL_M, with
# COMPLEX and IMAGE_ONLY defined, DOUBLE and USE_ALPHA undefined.
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


# zsymm3m_oucopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zsymm3m_olcopyb ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_lcopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX and USE_ALPHA defined.
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


# zsymm3m_oucopyr ($(PSUFFIX) object, built with PFLAGS): compiles
# generic/zsymm3m_ucopy_<n>.c, where <n> is the value of ZGEMM3M_UNROLL_N, with
# DOUBLE, COMPLEX, USE_ALPHA and REAL_ONLY defined.
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@


$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@


$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@


$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@


$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c


+ 16
- 0
kernel/setparam-ref.c View File

@@ -826,6 +826,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif


#ifdef PILEDRIVER

/* Parameter setup used when the library is built with PILEDRIVER defined. */

#ifdef DEBUG
/* Debug builds announce on stderr which parameter set was selected. */
fprintf(stderr, "Piledriver\n");
#endif

/* Fill the GEMM "p" entries of the dispatch table from the compile-time
   *_DEFAULT_P macros, one per precision (single, double, complex single,
   complex double).  The macro values are defined outside this section. */
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
/* The q/x entries are only assigned when EXPRECISION is defined. */
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef NANO #ifdef NANO


#ifdef DEBUG #ifdef DEBUG


+ 59
- 0
kernel/x86/KERNEL.PILEDRIVER View File

@@ -0,0 +1,59 @@
# Kernel source selection for the PILEDRIVER target (kernel/x86).
#
# Each <P>GEMMKERNEL / <P>TRSMKERNEL_* variable names the source file that
# implements that routine, where <P> is S, D, C or Z.  Every GEMM and GEMM3M
# kernel listed here is a *_barcelona.S source and every TRSM kernel is an
# SSE/SSE2 source; no Piledriver-specific assembly file is referenced.
#
# <P>GEMM{I,O}{N,T}COPY name the packing (copy) routine sources, all taken
# from ../generic, and the matching *OBJ variables name the objects built
# from them.
#
# Keep comments on their own lines: text following a value on an assignment
# line, including the whitespace before a '#', becomes part of the value.

# --- SGEMM: 4x4 kernel ---
# The "I" copy variables are left empty here.
# NOTE(review): presumably the "O" copy routines are reused because the
# kernel is square (4x4) -- confirm against the including Makefile.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- DGEMM: 2x4 kernel; "I" copies use the _2 sources, "O" copies the _4 ones ---
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- CGEMM: 2x2 kernel; "I" copy variables left empty, as for SGEMM ---
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- ZGEMM: 1x2 kernel; "I" copies use the _1 sources, "O" copies the _2 ones ---
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- TRSM kernels, one variable per variant (LN, LT, RN, RT) ---
# In every precision below the RN variant names the same source file as LT.
# NOTE(review): presumably the variant is selected by compile-time defines
# when that shared source is built -- confirm in the kernel Makefile.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S

# NOTE(review): unlike the S, D and C groups above, the Z group also points
# LN at the LT source (there is no ztrsm_kernel_LN_1x2_sse2.S reference) --
# confirm this is intentional and not a copy/paste slip.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S

# --- GEMM3M kernels for the complex precisions (4x4 for C, 2x4 for Z) ---
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

+ 5
- 5
kernel/x86/gemv_n_sse.S View File

@@ -101,10 +101,10 @@
#define Y 36 + STACKSIZE+ARGS(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp) #define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp) #define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp) #define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -153,8 +153,8 @@


movl YY,J movl YY,J
movl J,Y movl J,Y
movl STACK_LDA, LDA


movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX


@@ -688,9 +688,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 3
- 3
kernel/x86/gemv_n_sse2.S View File

@@ -714,9 +714,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 9
- 19
kernel/x86/gemv_t_sse.S View File

@@ -102,11 +102,9 @@
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)


#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -129,12 +127,8 @@


PROFCODE PROFCODE


movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X movl STACK_X, X
movl X,XX movl X,XX
movl N,J
movl J,NN # backup N
movl A,J movl A,J
movl J,AA # backup A movl J,AA # backup A
movl M,J movl M,J
@@ -144,7 +138,6 @@
addl $1,J addl $1,J
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer. subl $8, J # Don't use last 8 float in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J subl J,MMM # MMM=MMM-J
movl J,M movl J,M
jge .L00t jge .L00t
@@ -159,13 +152,10 @@
movl AA,%eax movl AA,%eax
movl %eax,A # mov AA to A movl %eax,A # mov AA to A


movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl XX,X
movl XX,%eax
movl %eax,X


movl STACK_LDA, LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -688,9 +678,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 13
- 16
kernel/x86/gemv_t_sse2.S View File

@@ -76,7 +76,7 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define ARGS 20


#define M 4 + STACKSIZE+ARGS(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
@@ -89,10 +89,9 @@
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)


#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)


#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -117,10 +116,8 @@
PROFCODE PROFCODE




movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl STACK_X, X
movl X,XX
movl A,J movl A,J
movl J,AA # backup A movl J,AA # backup A
movl M,J movl M,J
@@ -130,7 +127,6 @@
addl $1,J addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer. subl $4, J # Don't use last 4 double in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J subl J,MMM # MMM=MMM-J
movl J,M movl J,M
jge .L00t jge .L00t
@@ -142,15 +138,13 @@
movl %eax,M movl %eax,M


.L00t: .L00t:
movl XX,%eax
movl %eax, X

movl AA,%eax movl AA,%eax
movl %eax,A # mov AA to A movl %eax,A # mov AA to A


movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_LDA, LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -605,6 +599,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 2
- 2
kernel/x86/lsame.S View File

@@ -74,11 +74,11 @@
#else #else
movl %eax, %ecx movl %eax, %ecx
subl $32, %ecx subl $32, %ecx
cmovg %ecx, %eax
cmovge %ecx, %eax


movl %edx, %ecx movl %edx, %ecx
subl $32, %ecx subl $32, %ecx
cmovg %ecx, %edx
cmovge %ecx, %edx


subl %eax, %edx subl %eax, %edx
movl $0, %eax movl $0, %eax


+ 5
- 5
kernel/x86/trsm_kernel_LN_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LN_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2429,7 +2429,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3


+ 5
- 5
kernel/x86/trsm_kernel_LT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2562,7 +2562,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0


+ 5
- 5
kernel/x86/trsm_kernel_RT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@@ -1036,7 +1036,7 @@


.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_RT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@


.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@


.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@


.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@


.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4


.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif


@@ -2031,7 +2031,7 @@


.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2


+ 56
- 12
kernel/x86/zgemv_n_sse.S View File

@@ -89,18 +89,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -123,6 +128,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -130,6 +136,33 @@


PROFCODE PROFCODE


movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@@ -595,10 +628,21 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 55
- 11
kernel/x86/zgemv_n_sse2.S View File

@@ -76,18 +76,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)


#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -110,6 +115,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -117,6 +123,33 @@


PROFCODE PROFCODE


movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl YY,J
movl J,Y

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@@ -458,10 +491,21 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 58
- 13
kernel/x86/zgemv_t_sse.S View File

@@ -89,18 +89,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -123,6 +128,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -130,8 +136,35 @@


PROFCODE PROFCODE


movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl X,XX
movl A,J
movl J,AA #backup A
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl $8,J
subl J,MMM #MMM-=J
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl XX,%eax
movl %eax,X

movl STACK_LDA,LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -513,10 +546,22 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp

addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 58
- 14
kernel/x86/zgemv_t_sse2.S View File

@@ -76,19 +76,24 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)

#define MMM 0 + ARGS(%esp)
#define AA 4 + ARGS(%esp)
#define XX 8 + ARGS(%esp)


#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx


@@ -110,6 +115,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -117,8 +123,35 @@


PROFCODE PROFCODE


movl STACK_X, X
movl X, XX
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl $4,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax, M

.L00t:
movl XX, %eax
movl %eax, X

movl AA,%eax
movl %eax,A

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -188,7 +221,7 @@
movl Y, Y1 movl Y, Y1


movl N, J movl N, J
ALIGN_3
ALIGN_4


.L11: .L11:
movl BUFFER, X movl BUFFER, X
@@ -395,10 +428,21 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 2
- 2
kernel/x86/ztrsm_kernel_LN_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_LT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_RT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024


#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4


+ 45
- 36
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -1,62 +1,71 @@
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t_dup.S


SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY = ZGEMMINCOPY =
ZGEMMITCOPY = ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ = ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)


STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c


DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c


CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c


ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S


CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

+ 70
- 0
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -0,0 +1,70 @@
# Kernel source selection for the PILEDRIVER target.
# Each variable names the source file implementing one kernel. Most entries
# reuse the *_bulldozer.S assembly; the rest point at portable C sources under
# ../generic/ or at older SSE/Barcelona assembly.
# NOTE(review): presumably pulled in by the kernel Makefiles -- confirm.

# Double-complex matrix-vector kernels.
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

# Double-precision level-1/level-2 kernels (Bulldozer assembly).
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

# Single-precision GEMM: 16x2 compute kernel plus its four packing routines.
# The *COPYOBJ variables name the object files built from the *COPY sources.
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision GEMM: 8x2 compute kernel, all packing routines in assembly.
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single-complex GEMM: 4x2 compute kernel, generic C packing routines.
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-complex GEMM: 2x2 compute kernel. The "I" copy sources and objects
# are deliberately empty.
# NOTE(review): presumably because a square 2x2 kernel lets the "O" copy
# routines serve both operands -- confirm against the kernel Makefile.
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Complex 3M GEMM kernels reuse the Barcelona assembly.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

# Triangular-solve kernels: every precision uses the generic C implementation.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c



+ 5
- 1
kernel/x86_64/axpy_sse.S View File

@@ -69,7 +69,7 @@
#endif #endif
movaps %xmm0, ALPHA movaps %xmm0, ALPHA
#else #else
movaps %xmm3, ALPHA


movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
@@ -79,6 +79,10 @@


SAVEREGISTERS SAVEREGISTERS


#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
shufps $0, ALPHA, ALPHA shufps $0, ALPHA, ALPHA


leaq (, INCX, SIZE), INCX leaq (, INCX, SIZE), INCX


+ 4
- 1
kernel/x86_64/axpy_sse2.S View File

@@ -69,7 +69,6 @@
#endif #endif
movaps %xmm0, ALPHA movaps %xmm0, ALPHA
#else #else
movaps %xmm3, ALPHA


movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
@@ -79,6 +78,10 @@


SAVEREGISTERS SAVEREGISTERS


#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif

unpcklpd ALPHA, ALPHA unpcklpd ALPHA, ALPHA


leaq (, INCX, SIZE), INCX leaq (, INCX, SIZE), INCX


+ 1900
- 0
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 53
- 4
kernel/x86_64/cgemv_n.S View File

@@ -47,14 +47,22 @@


#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp) #define ALPHA 48 (%rsp)

#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
#define ALPHAI 112(%rsp)

#define M %rdi #define M %rdi
#define N %rsi #define N %rsi
#define A %rcx #define A %rcx
@@ -66,7 +74,7 @@


#else #else


#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +86,14 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp) #define ALPHA 224 (%rsp)


#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define XX 256(%rsp)
#define LDAX 264(%rsp)
#define ALPHAR 272(%rsp)
#define ALPHAI 280(%rsp)

#define M %rcx #define M %rcx
#define N %rdx #define N %rdx
#define A %r8 #define A %r8
@@ -142,9 +158,37 @@
movaps %xmm3, %xmm0 movaps %xmm3, %xmm0
movss OLD_ALPHA_I, %xmm1 movss OLD_ALPHA_I, %xmm1
#endif #endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movq X, XX
movq OLD_Y, Y
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI

.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq XX, X


movq OLD_INCX, INCX movq OLD_INCX, INCX
movq OLD_Y, Y
# movq OLD_Y, Y
movq OLD_INCY, INCY movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER movq OLD_BUFFER, BUFFER


@@ -4274,6 +4318,11 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12


+ 46
- 2
kernel/x86_64/cgemv_t.S View File

@@ -47,13 +47,19 @@


#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp) #define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
#define M %rdi #define M %rdi
#define N %rsi #define N %rsi
@@ -66,7 +72,7 @@


#else #else


#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +84,13 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp) #define ALPHA 224 (%rsp)


#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define ALPHAR 264(%rsp)
#define ALPHAI 272(%rsp)

#define M %rcx #define M %rcx
#define N %rdx #define N %rdx
#define A %r8 #define A %r8
@@ -144,6 +157,32 @@
movss OLD_ALPHA_I, %xmm1 movss OLD_ALPHA_I, %xmm1
#endif #endif


movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI

.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA

movq OLD_INCX, INCX movq OLD_INCX, INCX
movq OLD_Y, Y movq OLD_Y, Y
movq OLD_INCY, INCY movq OLD_INCY, INCY
@@ -4350,6 +4389,11 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12


+ 408
- 0
kernel/x86_64/daxpy_bulldozer.S View File

@@ -0,0 +1,408 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register aliases for the kernel's operands. ARGn come from common.h
   (NOTE(review): presumably the ABI's integer argument registers -- confirm). */
#ifndef WINDOWS_ABI
#define M ARG1
#define X ARG4
#define INCX ARG5
#define Y ARG6
/* INCY is not register-passed here; the entry code loads it from the stack. */
#define INCY ARG2
#else
/* On this ABI X, INCX, Y and INCY are all loaded from the stack at entry. */
#define M ARG1
#define X ARG2
#define INCX ARG3
#define Y ARG4
#define INCY %r10
#endif

/* YY: second pointer into Y, used by the strided loop to read ahead of the
   stores made through Y. ALPHA: the scalar multiplier, kept in xmm15. */
#define YY %r11
#define ALPHA %xmm15

/* Prefetch distance in bytes ahead of the current X/Y position. */
#define A_PRE 640

#include "l1param.h"
/* daxpy entry: y := alpha * x + y for M elements.
   Gathers the arguments, broadcasts alpha, converts the increments to byte
   strides, and dispatches to the unit-stride path (.L10) or the strided
   path (.L40). Returns 0 in rax via .L19 / .L47. */
PROLOGUE
PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
movq 8(%rsp), INCY
#else
movq 24(%rsp), INCY
#endif
vmovups %xmm0, ALPHA
#else
/* Stack arguments must be fetched before SAVEREGISTERS runs. */
movq 40(%rsp), X
movq 48(%rsp), INCX
movq 56(%rsp), Y
movq 64(%rsp), INCY
#endif

SAVEREGISTERS

#ifdef WINDOWS_ABI
/* ALPHA is xmm15. It is written only after SAVEREGISTERS so that the
   caller's xmm15 is not overwritten before it can be preserved. alpha
   arrives in xmm3, which is assumed untouched by SAVEREGISTERS. */
vmovups %xmm3, ALPHA
#endif

/* Duplicate the low double of ALPHA into both lanes. */
unpcklpd ALPHA, ALPHA

/* Scale the element increments by SIZE to get byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

/* Nothing to do for M <= 0. */
testq M, M
jle .L47
/* Any non-unit stride takes the strided path. */
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40

/* If the SIZE bit of Y's address is set, process one element first so the
   vector loop starts with that bit clear. */
testq $SIZE, Y
je .L10

movsd (X), %xmm0
mulsd ALPHA, %xmm0
addsd (Y), %xmm0
movsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4

/* Unit-stride path. X and Y are biased by 16 elements so the loop body can
   address one 16-element block with displacements -16 .. +14. */
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y

/* rax = number of full 16-element blocks. */
movq M, %rax
sarq $4, %rax
jle .L13

/* Preload the first 8 elements of X for the pipelined loop. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

/* With a single block, skip straight to the drain code. */
decq %rax
jle .L12
ALIGN_3

/* Main loop: 16 elements per iteration. Each half computes x*alpha + y with
   FMA4 on registers loaded earlier while loading the next 8 elements of X. */
.L11:

prefetchnta A_PRE(Y)

vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7


vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta A_PRE(X)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

prefetchnta A_PRE+64(Y)

/* Loads at 0..6 belong to the next block (consumed on the next iteration
   or in .L12). */
vmovups 0 * SIZE(X), %xmm0
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vmovups 2 * SIZE(X), %xmm1
vmovups 4 * SIZE(X), %xmm2
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups 6 * SIZE(X), %xmm3


vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3

/* Drain: finish the last full block using the values already in xmm0-xmm3,
   without loading past it. */
.L12:

vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7


vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7

vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3

/* Remainder handling: test bits 8, 4, 2, 1 of M in turn. After the andq the
   value is either 0 or the tested bit, so jle is taken exactly when the bit
   is clear. */
.L13:


movq M, %rax
andq $8, %rax
jle .L14
ALIGN_3

/* 8 remaining elements. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

/* 4 remaining elements. */
.L14:
movq M, %rax
andq $4, %rax
jle .L15
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1

vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

/* 2 remaining elements. */
.L15:
movq M, %rax
andq $2, %rax
jle .L16
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovups %xmm0, -16 * SIZE(Y)

addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

/* Last single element, handled with scalar FMA4. */
.L16:
movq M, %rax
andq $1, %rax
jle .L19
ALIGN_3

vmovsd -16 * SIZE(X), %xmm0
vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0

vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3

/* Unit-stride exit: return 0 in rax. */
.L19:
xorq %rax,%rax

RESTOREREGISTERS

ret
ALIGN_3


/* Strided path: INCX and/or INCY is not one element. YY reads Y ahead of
   the stores that go through Y. */
.L40:
movq Y, YY
movq M, %rax
/* If incx == 0 or incy == 0, skip the unrolled loop and process all M
   elements one at a time in .L46. With incy == 0 the unrolled loop would
   load the same Y element eight times before storing any result.
   NOTE(review): why incx == 0 also needs this is not evident here. */
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
/* rax = number of 8-element groups. */
sarq $3, %rax
jle .L45

prefetchnta 512(X)
prefetchnta 512+64(X)
prefetchnta 512+128(X)
prefetchnta 512+192(X)

prefetchnta 512(Y)
prefetchnta 512+64(Y)
prefetchnta 512+128(Y)
prefetchnta 512+192(Y)
ALIGN_3

/* Unrolled loop: 8 elements per iteration. Pairs of strided elements are
   packed into the low/high lanes of one register (vmovsd + vmovhpd), combined
   with a packed FMA4, then scattered back to Y lane by lane. */
.L41:

vmovsd 0 * SIZE(X), %xmm0
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm0 , %xmm0
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm6
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6
addq INCY, YY


vmovsd 0 * SIZE(X), %xmm1
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm1 , %xmm1
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm7
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7
addq INCY, YY

vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0

vmovsd 0 * SIZE(X), %xmm2
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm2 , %xmm2
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm8
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8
addq INCY, YY

vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1

vmovsd 0 * SIZE(X), %xmm3
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm3 , %xmm3
addq INCX, X

vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2

vmovsd 0 * SIZE(YY), %xmm9
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9
addq INCY, YY


/* Scatter the results back through Y. */
vmovsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm2, 0 * SIZE(Y)
addq INCY, Y

vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3

vmovsd %xmm3, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm3, 0 * SIZE(Y)
addq INCY, Y

decq %rax
jg .L41
ALIGN_3

/* rax = M mod 8 leftover elements. */
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3

/* Scalar loop: one element per iteration. Also the whole-vector loop when
   either increment is zero (entered with rax = M). */
.L46:
vmovsd (X), %xmm0
addq INCX, X

vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0

vmovsd %xmm0, (Y)
addq INCY, Y

decq %rax
jg .L46
ALIGN_3

/* Strided exit (also the M <= 0 exit): return 0 in rax. */
.L47:
xorq %rax, %rax

RESTOREREGISTERS

ret

EPILOGUE

+ 291
- 0
kernel/x86_64/dcopy_bulldozer.S View File

@@ -0,0 +1,291 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register aliases for the copy kernel's operands.
   NOTE(review): the register names in the trailing comments presumably
   describe the non-Windows mapping of ARGn -- confirm in common.h. */
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
/* Loaded from the stack at entry on this ABI. */
#define INCY %r10
#endif

#include "l1param.h"

/* VLOAD: unaligned 128-bit load of OFFSET(ADDR) into REG. */
#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
/* VSHUFPD_1: vshufpd with immediate 1; not referenced elsewhere in this file. */
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
/* Prefetch distances in bytes for the source (A_PRE) and destination (B_PRE). */
#define A_PRE 640
#define B_PRE 640

/* dcopy entry: copies M elements from X (stride INCX) to Y (stride INCY).
   Converts the increments to byte strides and dispatches to the unit-stride
   path (.L10) or the strided path (.L40). Returns 0 in rax. */
PROLOGUE
PROFCODE

#ifdef WINDOWS_ABI
/* Stack argument must be fetched before SAVEREGISTERS runs. */
movq 40(%rsp), INCY
#endif

SAVEREGISTERS

/* Scale the element increments by SIZE to get byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

/* Nothing to copy for M <= 0. Without this guard the alignment peel below
   would copy one element when M == 0, and the bit-tested tails would copy
   elements for negative M. */
testq M, M
jle .L19

/* Any non-unit stride takes the strided path. */
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40

/* If the SIZE bit of X's address is set, copy one element first so the
   vector loop starts with that bit clear. */
testq $SIZE, X
je .L10

vmovsd (X), %xmm0
vmovsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4

/* Unit-stride path. X and Y are biased by 16 elements so one 16-element
   block is addressed with displacements -16 .. +14. */
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y


/* rax = number of full 16-element blocks. */
movq M, %rax
sarq $4, %rax
jle .L13

/* Preload the first block (16 elements) into xmm0-xmm7. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups -8 * SIZE(X), %xmm4
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vmovups -2 * SIZE(X), %xmm7

/* With a single block, skip straight to the drain code. */
decq %rax
jle .L12
ALIGN_4

/* Main loop: store the block held in registers while loading the next one
   (displacements 0 .. 14 relative to the not-yet-advanced X). */
.L11:

prefetchnta A_PRE(X)
nop
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta B_PRE(Y)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

VLOAD( 0 * SIZE, X, %xmm0)
VLOAD( 2 * SIZE, X, %xmm1)
VLOAD( 4 * SIZE, X, %xmm2)
VLOAD( 6 * SIZE, X, %xmm3)

prefetchnta A_PRE+64(X)
nop
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta B_PRE+64(Y)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

VLOAD( 8 * SIZE, X, %xmm4)
VLOAD(10 * SIZE, X, %xmm5)
subq $-16 * SIZE, Y
VLOAD(12 * SIZE, X, %xmm6)
VLOAD(14 * SIZE, X, %xmm7)

subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3

/* Drain: store the last block already held in xmm0-xmm7. */
.L12:
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3

/* Remainder handling: test bits 8, 4, 2, 1 of M in turn. testq of a single
   low bit leaves the sign flag clear, so jle is taken exactly when the bit
   is not set. */
.L13:
testq $8, M
jle .L14
ALIGN_3

/* 8 remaining elements. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

/* 4 remaining elements. */
.L14:
testq $4, M
jle .L15
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

/* 2 remaining elements. */
.L15:
testq $2, M
jle .L16
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups %xmm0, -16 * SIZE(Y)

addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

/* Last single element. */
.L16:
testq $1, M
jle .L19
ALIGN_3

vmovsd -16 * SIZE(X), %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3

/* Unit-stride exit: return 0 in rax. */
.L19:
xorq %rax,%rax

RESTOREREGISTERS

ret
ALIGN_3



/* Strided path: INCX and/or INCY is not one element.
   rax = number of 8-element groups. */
.L40:
movq M, %rax
sarq $3, %rax
jle .L45
ALIGN_3

/* Unrolled loop: gather 8 elements from X into xmm0-xmm7, then scatter them
   to Y in the same order they were read. */
.L41:
vmovsd (X), %xmm0
addq INCX, X
vmovsd (X), %xmm4
addq INCX, X
vmovsd (X), %xmm1
addq INCX, X
vmovsd (X), %xmm5
addq INCX, X
vmovsd (X), %xmm2
addq INCX, X
vmovsd (X), %xmm6
addq INCX, X
vmovsd (X), %xmm3
addq INCX, X
vmovsd (X), %xmm7
addq INCX, X

vmovsd %xmm0, (Y)
addq INCY, Y
vmovsd %xmm4, (Y)
addq INCY, Y
vmovsd %xmm1, (Y)
addq INCY, Y
vmovsd %xmm5, (Y)
addq INCY, Y
vmovsd %xmm2, (Y)
addq INCY, Y
vmovsd %xmm6, (Y)
addq INCY, Y
vmovsd %xmm3, (Y)
addq INCY, Y
vmovsd %xmm7, (Y)
addq INCY, Y

decq %rax
jg .L41
ALIGN_3

/* rax = M mod 8 leftover elements. */
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3

/* Scalar loop: one element per iteration. */
.L46:
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3

/* Strided exit: return 0 in rax. */
.L47:
xorq %rax, %rax

RESTOREREGISTERS

ret

EPILOGUE

+ 311
- 0
kernel/x86_64/ddot_bulldozer.S View File

@@ -0,0 +1,311 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register aliases for the dot-product kernel's operands.
   NOTE(review): the register names in the trailing comments presumably
   describe the non-Windows mapping of ARGn -- confirm in common.h. */
#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
/* Loaded from the stack at entry on this ABI. */
#define INCY %r10
#endif

/* Prefetch distance in bytes ahead of the current X/Y position. */
#define A_PRE 512

#include "l1param.h"

/* ddot entry: accumulates the sum of X[i] * Y[i] over N elements in four
   partial accumulators (xmm0-xmm3) that are combined at .L999. */
PROLOGUE
PROFCODE

#ifdef WINDOWS_ABI
/* Stack argument is fetched before SAVEREGISTERS runs. */
movq 40(%rsp), INCY
#endif

SAVEREGISTERS

/* Scale the element increments by SIZE to get byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

/* Zero the four accumulators. */
vxorps %xmm0, %xmm0 , %xmm0
vxorps %xmm1, %xmm1 , %xmm1
vxorps %xmm2, %xmm2 , %xmm2
vxorps %xmm3, %xmm3 , %xmm3

/* N <= 0: go straight to the reduction, which yields zero. */
cmpq $0, N
jle .L999

/* Any non-unit stride takes the strided path. */
cmpq $SIZE, INCX
jne .L50
cmpq $SIZE, INCY
jne .L50

/* Bias both pointers by 16 elements; the unit-stride code addresses data
   with displacements starting at -16. */
subq $-16 * SIZE, X
subq $-16 * SIZE, Y

/* If the SIZE bit of Y's address is set, consume one element first; the
   product seeds the low lane of xmm0. */
testq $SIZE, Y
je .L10

vmovsd -16 * SIZE(X), %xmm0
vmulsd -16 * SIZE(Y), %xmm0 , %xmm0
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq N
ALIGN_2

/* Unit-stride path. rax = number of full 16-element blocks. */
.L10:

movq N, %rax
sarq $4, %rax
jle .L14

/* Preload the first block of X (16 elements) into xmm4-xmm11. */
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7

vmovups -8 * SIZE(X), %xmm8
vmovups -6 * SIZE(X), %xmm9
vmovups -4 * SIZE(X), %xmm10
vmovups -2 * SIZE(X), %xmm11

/* With a single block, skip straight to the drain code. */
decq %rax
jle .L12

ALIGN_3

/* Main loop: 16 elements per iteration. Each FMA4 computes
   acc = x * y(mem) + acc while the next block of X is loaded. */
.L11:
prefetchnta A_PRE(Y)

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
prefetchnta A_PRE(X)
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

vmovups 0 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vmovups 2 * SIZE(X), %xmm5
vmovups 4 * SIZE(X), %xmm6
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
vmovups 6 * SIZE(X), %xmm7

prefetchnta A_PRE+64(Y)

vmovups 8 * SIZE(X), %xmm8
vmovups 10 * SIZE(X), %xmm9
prefetchnta A_PRE+64(X)
vmovups 12 * SIZE(X), %xmm10
vmovups 14 * SIZE(X), %xmm11

subq $-16 * SIZE, X
subq $-16 * SIZE, Y

decq %rax
jg .L11
ALIGN_3

/* Drain: accumulate the last block already held in xmm4-xmm11. */
.L12:

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3

subq $-16 * SIZE, X
subq $-16 * SIZE, Y
ALIGN_3

/* Remainder handling: leave if no low bits of N are set, otherwise test
   bits 8, 4, 2, 1 in turn. testq of low bits leaves the sign flag clear, so
   jle is taken exactly when the tested bits are all zero. */
.L14:
testq $15, N
jle .L999

testq $8, N
jle .L15

/* 8 remaining elements. */
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

/* 4 remaining elements. */
.L15:
testq $4, N
jle .L16

vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

/* 2 remaining elements. */
.L16:
testq $2, N
jle .L17

vmovups -16 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0


addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

/* Last single element: the scalar loads zero the upper lanes, so the packed
   FMA leaves the upper lane of the accumulator unchanged. */
.L17:
testq $1, N
jle .L999

vmovsd -16 * SIZE(X), %xmm4
vmovsd -16 * SIZE(Y), %xmm5
vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0
jmp .L999
ALIGN_3


/* Strided path: INCX and/or INCY is not one element.
   rax = number of 8-element groups. */
.L50:
movq N, %rax
sarq $3, %rax
jle .L55
ALIGN_3

/* Unrolled loop: 8 elements per iteration in two rounds of four. Each
   element is loaded as a scalar (upper lane zeroed), so the packed FMA4
   only changes the low lane of each accumulator. */
.L53:


vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y

vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3


vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y

vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3

decq %rax
jg .L53
ALIGN_3

/* rax = N mod 8 leftover elements. */
.L55:
movq N, %rax
andq $7, %rax
jle .L999
ALIGN_3

/* Scalar loop: one element per iteration, accumulated into xmm0. */
.L56:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0

decq %rax
jg .L56
ALIGN_3

/* Final reduction (shared by every path): add the four accumulators, then
   add the two lanes horizontally so the low lane of xmm0 holds the total. */
.L999:
vaddpd %xmm1, %xmm0 , %xmm0
vaddpd %xmm3, %xmm2 , %xmm2
vaddpd %xmm2, %xmm0 , %xmm0

vhaddpd %xmm0, %xmm0 , %xmm0

RESTOREREGISTERS

ret

EPILOGUE

+ 0
- 1860
kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
File diff suppressed because it is too large
View File


+ 3880
- 0
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 1821
- 0
kernel/x86_64/dgemm_ncopy_8_bulldozer.S
File diff suppressed because it is too large
View File


+ 667
- 0
kernel/x86_64/dgemm_tcopy_8_bulldozer.S View File

@@ -0,0 +1,667 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS

#define A_PRE 256

#ifndef WINDOWS_ABI

#define N ARG1 /* rsi */
#define M ARG2 /* rdi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

#define AO1 %r9
#define AO2 %r10
#define LDA3 %r11
#define M8 %r12

#else

#define N ARG1 /* rdx */
#define M ARG2 /* rcx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 56(%rsp)

#define B %r12

#define AO1 %rsi
#define AO2 %rdi
#define LDA3 %r10
#define M8 %r11
#endif

#define I %rax

#define B0 %rbp
#define B1 %r13
#define B2 %r14
#define B3 %r15

/* Entry: preserve the registers this routine overwrites, then set up the
   leftover destination cursors (B1/B2/B3) and the byte strides shared by
   every section below. */
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
/* rsi/rdi back AO1/AO2 under WINDOWS_ABI, so they are saved as well. */
pushq %rdi
pushq %rsi
#endif

pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp

#ifdef WINDOWS_ABI
/* The fifth argument is read from the stack under WINDOWS_ABI. */
movq OLD_B, B
#endif

/* Bias B by +16 elements; every store below compensates with a
   -16 * SIZE displacement. Presumably written as a subtract of a negative
   so the immediate stays small when SIZE is 8 - TODO confirm. */
subq $-16 * SIZE, B

/* B1/B2/B3 = B + (M rounded down to a multiple of 8, 4, 2) * N elements. */
movq M, B1
movq M, B2
movq M, B3

andq $-8, B1
andq $-4, B2
andq $-2, B3

imulq N, B1
imulq N, B2
imulq N, B3

leaq (B, B1, SIZE), B1
leaq (B, B2, SIZE), B2
leaq (B, B3, SIZE), B3

/* LDA becomes a byte stride; LDA3 = 3 * LDA (bytes). */
leaq (,LDA, SIZE), LDA
leaq (LDA, LDA, 2), LDA3

/* M8 = N * SIZE bytes; the loops advance B0 by eight times this amount. */
leaq (, N, SIZE), M8

/* Fewer than eight lines left in N: skip the eight-line section. */
cmpq $8, N
jl .L20
ALIGN_4

/* Eight-line section: each pass consumes eight LDA-strided source lines
   and repeats while at least eight remain in N. */
.L11:
subq $8, N

/* AO1 addresses lines 0-3, AO2 lines 4-7; A moves on by eight lines. */
movq A, AO1
leaq (A, LDA, 4), AO2
leaq (A, LDA, 8), A

/* B0 walks this pass; B advances by one 64-element tile for the next pass. */
movq B, B0
addq $64 * SIZE, B

/* I = M / 8: number of full eight-element chunks along each line. */
movq M, I
sarq $3, I
jle .L14
ALIGN_4

/* Inner loop: copy eight elements from each of the eight lines into one
   contiguous 64-element tile (displacements -16 .. +46 * SIZE from B0),
   with a non-temporal prefetch A_PRE bytes ahead on every line. */
.L13:

prefetchnta A_PRE(AO1)
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)


vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)

prefetchnta A_PRE(AO2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, 16 * SIZE(B0)
vmovups %xmm1, 18 * SIZE(B0)
vmovups %xmm2, 20 * SIZE(B0)
vmovups %xmm3, 22 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 24 * SIZE(B0)
vmovups %xmm1, 26 * SIZE(B0)
vmovups %xmm2, 28 * SIZE(B0)
vmovups %xmm3, 30 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)

vmovups %xmm0, 32 * SIZE(B0)
vmovups %xmm1, 34 * SIZE(B0)
vmovups %xmm2, 36 * SIZE(B0)
vmovups %xmm3, 38 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, 40 * SIZE(B0)
vmovups %xmm1, 42 * SIZE(B0)
vmovups %xmm2, 44 * SIZE(B0)
vmovups %xmm3, 46 * SIZE(B0)

/* Step eight elements along the source lines; B0 jumps 8 * M8 bytes. */
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L13
ALIGN_4

/* M & 4: four more elements from each of the eight lines go to B1
   (32 elements in total). */
.L14:
testq $4, M
jle .L16

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 0 * SIZE(B1)
vmovups %xmm1, 2 * SIZE(B1)
vmovups %xmm2, 4 * SIZE(B1)
vmovups %xmm3, 6 * SIZE(B1)

VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B1)
vmovups %xmm1, 10 * SIZE(B1)
vmovups %xmm2, 12 * SIZE(B1)
vmovups %xmm3, 14 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-32 * SIZE, B1
ALIGN_4

/* M & 2: two more elements from each of the eight lines go to B2
   (16 elements in total). */
.L16:
testq $2, M
jle .L18

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B2)
vmovups %xmm1, -6 * SIZE(B2)
vmovups %xmm2, -4 * SIZE(B2)
vmovups %xmm3, -2 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-16 * SIZE, B2
ALIGN_4

/* M & 1: one last element from each of the eight lines; pairs of scalars
   are packed into one xmm register with vunpcklpd before being stored
   to B3 (8 elements in total). */
.L18:
testq $1, M
jle .L19

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO1, LDA3), %xmm3

vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2

vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)

vmovsd 0 * SIZE(AO2), %xmm0
vmovsd 0 * SIZE(AO2, LDA), %xmm1
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO2, LDA3), %xmm3

vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2

vmovups %xmm0, -12 * SIZE(B3)
vmovups %xmm2, -10 * SIZE(B3)

subq $-8 * SIZE, B3
ALIGN_4

/* Another full group of eight lines left? Then run the section again. */
.L19:
cmpq $8, N
jge .L11
ALIGN_4

/* Four-line section: runs at most once, when at least four lines remain
   in N after the eight-line section. */
.L20:
cmpq $4, N
jl .L30

subq $4, N

/* AO1 addresses lines 0-1, AO2 lines 2-3; A moves on by four lines. */
movq A, AO1
leaq (A, LDA, 2), AO2
leaq (A, LDA, 4), A

/* B advances by one 32-element tile (4 lines x 8 elements). */
movq B, B0
addq $32 * SIZE, B

/* I = M / 8 full chunks along each line. */
movq M, I
sarq $3, I
jle .L24
ALIGN_4

/* Inner loop: eight elements from each of the four lines form one
   contiguous 32-element tile at B0. */
.L23:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)


VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)

VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)

/* Step eight elements along the source lines; B0 jumps 8 * M8 bytes. */
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L23
ALIGN_4

/* M & 4: four more elements from each of the four lines go to B1. */
.L24:
testq $4, M
jle .L26

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-16 * SIZE, B1
ALIGN_4

/* M & 2: two more elements from each of the four lines go to B2. */
.L26:
testq $2, M
jle .L28

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-8 * SIZE, B2
ALIGN_4

/* M & 1: one last element from each of the four lines, packed pairwise
   with vunpcklpd and stored to B3. */
.L28:
testq $1, M
jle .L30

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO2), %xmm2
vmovsd 0 * SIZE(AO2, LDA), %xmm3

vunpcklpd %xmm1, %xmm0, %xmm0
vunpcklpd %xmm3, %xmm2, %xmm2

vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
subq $-4 * SIZE, B3
ALIGN_4

/* Two-line section: runs at most once, when at least two lines remain
   in N. */
.L30:
cmpq $2, N
jl .L40

subq $2, N

/* AO1 addresses line 0, AO2 line 1; A moves on by two lines. */
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A

/* B advances by one 16-element tile (2 lines x 8 elements). */
movq B, B0
addq $16 * SIZE, B

/* I = M / 8 full chunks along each line. */
movq M, I
sarq $3, I
jle .L34
ALIGN_4

/* Inner loop: eight elements from each of the two lines form one
   contiguous 16-element tile at B0. */
.L33:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)

/* Step eight elements along the source lines; B0 jumps 8 * M8 bytes. */
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L33
ALIGN_4

/* M & 4: four more elements from each of the two lines go to B1. */
.L34:
testq $4, M
jle .L36

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B1
ALIGN_4

/* M & 2: two more elements from each of the two lines go to B2. */
.L36:
testq $2, M
jle .L38

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-4 * SIZE, B2
ALIGN_4

/* M & 1: one last element from each line, packed into a single xmm
   register and stored to B3. */
.L38:
testq $1, M
jle .L40

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1

vunpcklpd %xmm1, %xmm0, %xmm0

vmovups %xmm0, -16 * SIZE(B3)
subq $-2 * SIZE, B3
ALIGN_4

/* Single-line section: handles the last remaining line in N, if any.
   A and B are not advanced afterwards because nothing follows. */
.L40:
cmpq $1, N
jl .L999

movq A, AO1

movq B, B0

/* I = M / 8 full chunks along the line. */
movq M, I
sarq $3, I
jle .L44
ALIGN_4

/* Inner loop: eight elements of the line per iteration, stored at B0. */
.L43:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)

/* Step eight elements along the line; B0 jumps 8 * M8 bytes. */
addq $8 * SIZE, AO1
leaq (B0, M8, 8), B0

decq I
jg .L43
ALIGN_4

/* M & 4: four more elements go to B1. */
.L44:
testq $4, M
jle .L45

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)

addq $4 * SIZE, AO1
subq $-4 * SIZE, B1
ALIGN_4

/* M & 2: two more elements go to B2. */
.L45:
testq $2, M
jle .L46

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)

vmovups %xmm0, -16 * SIZE(B2)

addq $2 * SIZE, AO1
subq $-2 * SIZE, B2
ALIGN_4

/* M & 1 in the single-line section: copy the one remaining element to
   B3 (same -16 * SIZE bias as every other store in this routine). */
.L46:
testq $1, M
jle .L999

vmovsd 0 * SIZE(AO1), %xmm0

vmovsd %xmm0, -16 * SIZE(B3)
/* Fall through into .L999: it is the very next label, so the explicit
   jump that used to sit here was redundant. The other sections of this
   routine already fall through ALIGN_4 padding into the next label. */
ALIGN_4
/* Common exit: restore the saved registers in reverse push order and
   return. */
.L999:
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15

#ifdef WINDOWS_ABI
/* Matches the extra rdi/rsi pushes done on entry under WINDOWS_ABI. */
popq %rsi
popq %rdi
#endif
ret

EPILOGUE

+ 49
- 7
kernel/x86_64/dgemv_n.S View File

@@ -47,7 +47,7 @@


#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi #define OLD_M %rdi
#define OLD_N %rsi #define OLD_N %rsi
@@ -59,9 +59,14 @@
#define STACK_BUFFER 32 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp) #define ALPHA 48 (%rsp)


#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 88(%rsp)
#else #else


#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx #define OLD_M %rcx
#define OLD_N %rdx #define OLD_N %rdx
@@ -74,6 +79,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp) #define ALPHA 224 (%rsp)


#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)

#endif #endif


#define LDA %r8 #define LDA %r8
@@ -137,17 +148,42 @@
movq OLD_LDA, LDA movq OLD_LDA, LDA
#endif #endif


movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
movsd %xmm0, ALPHA movsd %xmm0, ALPHA
#else #else
movsd %xmm3, ALPHA movsd %xmm3, ALPHA
#endif #endif


movq STACK_Y, Y
movq A,AA
movq N,NN
movq M,MMM
movq LDA,LDAX
movq X,XX

.L0t:
xorq I,I
addq $1,I
salq $21,I
subq I,MMM
movq I,M
jge .L00t

movq MMM,M
addq I,M
jle .L999x
.L00t:
movq XX,X
movq AA,A
movq NN,N
movq LDAX,LDA

movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER


leaq -1(INCY), %rax leaq -1(INCY), %rax


leaq (,INCX, SIZE), INCX leaq (,INCX, SIZE), INCX
@@ -2815,6 +2851,12 @@
ALIGN_3 ALIGN_3


.L999: .L999:
leaq (, M, SIZE), %rax
addq %rax,AA
jmp .L0t
ALIGN_4

.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12


+ 2325
- 0
kernel/x86_64/dgemv_n_bulldozer.S
File diff suppressed because it is too large
View File


+ 1938
- 0
kernel/x86_64/dgemv_t_bulldozer.S
File diff suppressed because it is too large
View File


+ 360
- 0
kernel/x86_64/gemm_ncopy_2_bulldozer.S View File

@@ -0,0 +1,360 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"


/* Argument and scratch register aliases for both calling conventions. */
#ifndef WINDOWS_ABI

#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

/* Inner-loop trip counter. */
#define I %r9

#else

/* Bytes of stack reserved in the prologue; the xmm6-xmm15 save area
   occupies the first 160 of them. */
#define STACKSIZE 256

#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* Stack slot of the fifth argument once the prologue has run; the 32
   matches the four 8-byte pushes done under WINDOWS_ABI. */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)

#define B %r14
#define I %r15

#endif

/* J counts pairs of source lines; AO1/AO2 are the two source cursors.
   NOTE(review): AO3 and AO4 are not referenced by the code in this file. */
#define J %r10
#define AO1 %r11
#define AO2 %r12
#define AO3 %r13
#define AO4 %rax

/* Entry: save the registers this routine overwrites, fetch B from the
   stack under WINDOWS_ABI, and start the loop over pairs of source lines. */
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
#endif
pushq %r13
pushq %r12

#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp

/* Save xmm6-xmm15; the copy loops below overwrite them. */
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)

movq OLD_B, B
#endif

/* LDA becomes a byte stride. */
leaq (,LDA, SIZE), LDA # Scaling

/* J = N / 2: number of line pairs; none means only the odd-line tail. */
movq N, J
sarq $1, J
jle .L20
ALIGN_4

/* Outer loop over pairs of LDA-strided source lines. Elements of the
   two lines are interleaved in the output: AO1[0], AO2[0], AO1[1],
   AO2[1], ... */
.L01:
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A

/* I = M / 8: full eight-element chunks along the pair. */
movq M, I
sarq $3, I
jle .L08
ALIGN_4

/* Eight elements from each line per iteration, 16 elements written. */
.L03:

#ifndef DOUBLE
/* Single precision: scalar loads alternating between the two lines,
   then scalar stores to consecutive slots of B. */
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7

vmovss 4 * SIZE(AO1), %xmm8
vmovss 4 * SIZE(AO2), %xmm9
vmovss 5 * SIZE(AO1), %xmm10
vmovss 5 * SIZE(AO2), %xmm11
vmovss 6 * SIZE(AO1), %xmm12
vmovss 6 * SIZE(AO2), %xmm13
vmovss 7 * SIZE(AO1), %xmm14
vmovss 7 * SIZE(AO2), %xmm15

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)

vmovss %xmm8, 8 * SIZE(B)
vmovss %xmm9, 9 * SIZE(B)
vmovss %xmm10, 10 * SIZE(B)
vmovss %xmm11, 11 * SIZE(B)
vmovss %xmm12, 12 * SIZE(B)
vmovss %xmm13, 13 * SIZE(B)
vmovss %xmm14, 14 * SIZE(B)
vmovss %xmm15, 15 * SIZE(B)

#else
/* Double precision: the low half of each xmm comes from AO1 (vmovsd),
   the high half from AO2 (vmovhpd), then one 16-byte store per pair.
   Prefetches run 256 bytes ahead on both sources and on B. */
prefetchw 256(B)

prefetchnta 256(AO1)
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 5 * SIZE(AO1), %xmm5
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 7 * SIZE(AO1), %xmm7

prefetchnta 256(AO2)
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7


prefetchw 256+64(B)
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
vmovups %xmm4, 8 * SIZE(B)
vmovups %xmm5, 10 * SIZE(B)
vmovups %xmm6, 12 * SIZE(B)
vmovups %xmm7, 14 * SIZE(B)

#endif

/* Eight elements consumed per line, sixteen produced. */
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
subq $-16 * SIZE, B
decq I
jg .L03
ALIGN_4


/* M & 4: one block of four elements from each line. */
.L08:
testq $4 , M
je .L14

ALIGN_4


.L13:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
#else

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3

vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3


vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
#endif

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B
ALIGN_4

/* M & 3: remaining elements, one interleaved pair per iteration. */
.L14:
movq M, I
andq $3, I
jle .L16
ALIGN_4

.L15:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0

vmovups %xmm0, 0 * SIZE(B)
#endif

addq $SIZE, AO1
addq $SIZE, AO2
addq $2 * SIZE, B
decq I
jg .L15
ALIGN_4

/* Next pair of lines. */
.L16:
decq J
jg .L01
ALIGN_4

/* Odd-line tail: when N is odd, the last line is copied to B unchanged
   (there is no partner line to interleave with). */
.L20:
testq $1, N
jle .L999

movq A, AO1

/* I = M / 4: blocks of four elements. */
movq M, I
sarq $2, I
jle .L34
ALIGN_4

/* Four elements per iteration: one 16-byte move when DOUBLE is not
   defined, two when it is. */
.L33:
#ifndef DOUBLE
vmovups 0 * SIZE(AO1), %xmm0

vmovups %xmm0, 0 * SIZE(B)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm1

vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
#endif

addq $4 * SIZE, AO1
subq $-4 * SIZE, B
decq I
jg .L33
ALIGN_4

/* M & 3: remaining elements, one at a time. */
.L34:
movq M, I
andq $3, I
jle .L999
ALIGN_4

.L35:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(B)
#endif

addq $SIZE, AO1
addq $1 * SIZE, B
decq I
jg .L35
ALIGN_4


/* Common exit: undo the prologue in reverse order and return. */
.L999:
#ifdef WINDOWS_ABI
/* Reload xmm6-xmm15 from the save area and release it. */
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15

addq $STACKSIZE, %rsp
#endif

popq %r12
popq %r13

#ifdef WINDOWS_ABI
popq %r14
popq %r15
#endif
ret

EPILOGUE

+ 374
- 0
kernel/x86_64/gemm_tcopy_2_bulldozer.S View File

@@ -0,0 +1,374 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Argument and scratch register aliases for both calling conventions.
   NOTE(review): AO3 and AO4 are defined in both branches but are not
   referenced by the code in this file. */
#ifndef WINDOWS_ABI

#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

/* I: inner-loop counter, J: counter over pairs of source lines. */
#define I %r10
#define J %rbp

#define AO1 %r9
#define AO2 %r15
#define AO3 %r11
#define AO4 %r14
/* BO1: destination cursor for the N&1 leftovers, M8: M * SIZE in bytes,
   BO: destination cursor for the 2-wide blocks. */
#define BO1 %r13
#define M8 %rbx
#define BO %rax

#else

/* Bytes of stack reserved in the prologue; the xmm6-xmm15 save area
   occupies the first 160 of them. */
#define STACKSIZE 256

#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* Stack slot of the fifth argument once the prologue has run; the 64
   matches the eight 8-byte pushes done under WINDOWS_ABI. */
#define OLD_B 40 + 64 + STACKSIZE(%rsp)

#define B %rdi

#define I %r10
#define J %r11

#define AO1 %r12
#define AO2 %r13
#define AO3 %r14
#define AO4 %r15

#define BO1 %rsi
#define M8 %rbp
#define BO %rax

#endif

/* Entry: save the registers this routine overwrites, then compute the
   leftover cursor BO1 and the byte strides used by the loops below. */
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
/* rdi/rsi back B and BO1 under WINDOWS_ABI, so they are saved as well. */
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx

#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp

/* NOTE(review): xmm6-xmm15 are all saved, although the code in this
   file only touches xmm0-xmm7. */
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)

movq OLD_B, B
#endif

/* BO1 = B + (N rounded down to even) * M elements: where the N&1
   leftovers are written. */
movq N, %rax
andq $-2, %rax
imulq M, %rax

leaq (B, %rax, SIZE), BO1

/* LDA becomes a byte stride; M8 = M * SIZE bytes. */
leaq (, LDA, SIZE), LDA
leaq (, M, SIZE), M8

/* J = M / 2: number of line pairs; none means only the odd-line tail. */
movq M, J
sarq $1, J
jle .L20
ALIGN_4

/* Outer loop over pairs of LDA-strided source lines. Each step writes a
   four-element block (two elements from AO1, then two from AO2) at BO
   and advances BO by 2 * M8 bytes. */
.L01:
movq A, AO1
leaq (A, LDA ), AO2
leaq (A, LDA, 2), A

/* BO starts at B; B advances by one four-element block for the next pair. */
movq B, BO
addq $4 * SIZE, B

/* I = N / 8: eight elements per line, i.e. four blocks, per iteration. */
movq N, I
sarq $3, I
jle .L10
ALIGN_4


.L08:
#ifndef DOUBLE

/* vmovsd moves 8 bytes per access; presumably two single-precision
   elements here (SIZE would be 4) - TODO confirm. */
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd 4 * SIZE(AO2), %xmm5
vmovsd 6 * SIZE(AO2), %xmm7

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm4, 0 * SIZE(BO)
vmovsd %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm6, 0 * SIZE(BO)
vmovsd %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO


#else

/* Double path: 16-byte moves, with non-temporal prefetches 256 bytes
   ahead on both source lines. */
prefetchnta 256(AO1)
prefetchnta 256(AO2)
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 4 * SIZE(AO1), %xmm4
vmovups 6 * SIZE(AO1), %xmm6
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups 4 * SIZE(AO2), %xmm5
vmovups 6 * SIZE(AO2), %xmm7

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm4, 0 * SIZE(BO)
vmovups %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm6, 0 * SIZE(BO)
vmovups %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

#endif
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
decq I
jg .L08
ALIGN_4



/* N & 4: four more elements per line, i.e. two blocks. */
.L10:
testq $4, N
jle .L12
#ifndef DOUBLE

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO


#else

vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
ALIGN_4


/* N & 2: two more elements per line, i.e. one block. */
.L12:
testq $2, N
jle .L14
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 0 * SIZE(AO2), %xmm1

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
#endif
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
ALIGN_4

/* N & 1: one last element from each line goes to the leftover area at
   BO1, which then advances by two elements. */
.L14:
testq $1, N
jle .L19

#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1

vmovss %xmm0, 0 * SIZE(BO1)
vmovss %xmm1, 1 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0

vmovups %xmm0, 0 * SIZE(BO1)
#endif

addq $2 * SIZE, BO1
ALIGN_4

/* Next pair of lines. */
.L19:
decq J
jg .L01
ALIGN_4

/* Odd-line tail: when M is odd, the last source line is copied alone. */
.L20:
testq $1, M
jle .L999
ALIGN_4

.L31:
movq A, AO1
movq B, BO

/* I = N / 2: two elements per iteration. */
movq N, I
sarq $1, I
jle .L33
ALIGN_4

/* Copy two elements to BO, then advance BO by 2 * M8 bytes. */
.L32:
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(BO)
#endif

addq $2 * SIZE, AO1
leaq (BO, M8, 2), BO
decq I
jg .L32
ALIGN_4

/* N & 1: the final single element goes to the leftover area at BO1. */
.L33:
testq $1, N
jle .L999

#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO1)
#endif
addq $1 * SIZE, BO1
ALIGN_4

/* Common exit: undo the prologue in reverse order and return. */
.L999:
#ifdef WINDOWS_ABI
/* Reload xmm6-xmm15 from the save area and release it. */
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15

addq $STACKSIZE, %rsp
#endif

popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
/* Matches the extra rdi/rsi pushes done on entry under WINDOWS_ABI. */
popq %rsi
popq %rdi
#endif

ret

EPILOGUE

+ 4657
- 0
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 48
- 8
kernel/x86_64/sgemv_n.S View File

@@ -47,7 +47,7 @@


#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi #define OLD_M %rdi
#define OLD_N %rsi #define OLD_N %rsi
@@ -58,10 +58,14 @@
#define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp) #define ALPHA 48 (%rsp)

#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 96(%rsp)
#else #else


#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx #define OLD_M %rcx
#define OLD_N %rdx #define OLD_N %rdx
@@ -74,6 +78,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp) #define ALPHA 224 (%rsp)


#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)

#endif #endif


#define LDA %r8 #define LDA %r8
@@ -137,17 +147,41 @@
movq OLD_LDA, LDA movq OLD_LDA, LDA
#endif #endif


movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
movss %xmm0, ALPHA movss %xmm0, ALPHA
#else #else
movss %xmm3, ALPHA movss %xmm3, ALPHA
#endif #endif



movq M,MMM
movq A,AA
movq N,NN
movq LDA,LDAX
movq X,XX
movq STACK_Y, Y
.L0t:
xorq I,I
addq $1,I
salq $22,I
subq I,MMM
movq I,M
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA,A
movq NN,N
movq LDAX,LDA
movq XX,X

movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

leaq (,INCX, SIZE), INCX leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA leaq (,LDA, SIZE), LDA
@@ -5990,6 +6024,12 @@
ALIGN_3 ALIGN_3


.L999: .L999:
leaq (,M,SIZE),%rax
addq %rax,AA
jmp .L0t
ALIGN_4

.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12


+ 5
- 5
kernel/x86_64/sgemv_t.S View File

@@ -63,7 +63,7 @@


#else #else


#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx #define OLD_M %rcx
#define OLD_N %rdx #define OLD_N %rdx
@@ -74,10 +74,10 @@
#define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define MMM 216(%rsp)
#define NN 224(%rsp)
#define AA 232(%rsp)
#define LDAX 240(%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)


#endif #endif




+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif


#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)


+ 1407
- 0
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 3
- 3
kernel/x86_64/zgemm_kernel_4x4_sandy.S View File

@@ -1385,7 +1385,7 @@ ALIGN_5
EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec14, xvec6;
EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec13, xvec5;
EXTRA_DY $2, yvec12, xvec4;
EXTRA_DY $1, yvec12, xvec4;
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0; LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 1*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0;
@@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1);
STH_DX xvec7, 3*SIZE(C0, ldc, 1); STH_DX xvec7, 3*SIZE(C0, ldc, 1);
STL_DX xvec13, 0*SIZE(C0, ldc, 1); STL_DX xvec13, 0*SIZE(C0, ldc, 1);
STH_DX xvec13, 1*SIZE(C0, ldc, 1); STH_DX xvec13, 1*SIZE(C0, ldc, 1);
STL_DX xvec6, 2*SIZE(C0);
STH_DX xvec6, 3*SIZE(C0);
STL_DX xvec5, 2*SIZE(C0);
STH_DX xvec5, 3*SIZE(C0);
#ifndef TRMMKERNEL #ifndef TRMMKERNEL
LDL_DX 0*SIZE(C1), xvec0, xvec0; LDL_DX 0*SIZE(C1), xvec0, xvec0;
LDH_DX 1*SIZE(C1), xvec0, xvec0; LDH_DX 1*SIZE(C1), xvec0, xvec0;


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save