Browse Source

Merge branch 'develop' into bulldozer

Conflicts:
	kernel/x86_64/KERNEL.BULLDOZER
tags/v0.2.9.rc1
Zhang Xianyi 12 years ago
parent
commit
72b1edaf1b
100 changed files with 22312 additions and 2519 deletions
  1. +4
    -0
      .gitignore
  2. +24
    -0
      .travis.yml
  3. +87
    -0
      CONTRIBUTORS.md
  4. +50
    -0
      Changelog.txt
  5. +49
    -44
      Makefile
  6. +0
    -5
      Makefile.generic
  7. +18
    -13
      Makefile.install
  8. +0
    -11
      Makefile.power
  9. +6
    -3
      Makefile.rule
  10. +0
    -1
      Makefile.sparc
  11. +121
    -11
      Makefile.system
  12. +0
    -3
      Makefile.x86
  13. +0
    -13
      Makefile.x86_64
  14. +29
    -14
      README.md
  15. +1
    -1
      TargetList.txt
  16. +12
    -6
      c_check
  17. +10
    -0
      cblas.h
  18. +17
    -0
      common.h
  19. +9
    -1
      common_alpha.h
  20. +8
    -1
      common_ia64.h
  21. +14
    -1
      common_linux.h
  22. +10
    -3
      common_mips64.h
  23. +10
    -1
      common_sparc.h
  24. +10
    -3
      common_x86.h
  25. +9
    -1
      common_x86_64.h
  26. +7
    -0
      cpuid.h
  27. +1
    -0
      cpuid_power.c
  28. +68
    -11
      cpuid_x86.c
  29. +14
    -0
      ctest.c
  30. +1
    -1
      ctest/Makefile
  31. +6
    -4
      driver/level2/sbmv_thread.c
  32. +2
    -2
      driver/level3/gemm_thread_n.c
  33. +14
    -1
      driver/level3/level3.c
  34. +24
    -1
      driver/level3/level3_gemm3m_thread.c
  35. +23
    -0
      driver/level3/level3_syrk_threaded.c
  36. +38
    -2
      driver/level3/level3_thread.c
  37. +4
    -1
      driver/others/Makefile
  38. +1
    -0
      driver/others/blas_server.c
  39. +50
    -6
      driver/others/blas_server_omp.c
  40. +2
    -1
      driver/others/blas_server_win32.c
  41. +42
    -7
      driver/others/dynamic.c
  42. +5
    -2
      driver/others/init.c
  43. +22
    -1
      driver/others/memory.c
  44. +52
    -0
      driver/others/openblas_get_parallel.c
  45. +14
    -9
      exports/Makefile
  46. +56
    -21
      exports/gensymbol
  47. +1
    -1
      f_check
  48. +38
    -13
      getarch.c
  49. +40
    -2
      getarch_2nd.c
  50. +13
    -0
      interface/trtri.c
  51. +1
    -1
      kernel/Makefile.L2
  52. +216
    -216
      kernel/Makefile.L3
  53. +16
    -0
      kernel/setparam-ref.c
  54. +59
    -0
      kernel/x86/KERNEL.PILEDRIVER
  55. +5
    -5
      kernel/x86/gemv_n_sse.S
  56. +3
    -3
      kernel/x86/gemv_n_sse2.S
  57. +9
    -19
      kernel/x86/gemv_t_sse.S
  58. +13
    -16
      kernel/x86/gemv_t_sse2.S
  59. +2
    -2
      kernel/x86/lsame.S
  60. +5
    -5
      kernel/x86/trsm_kernel_LN_2x4_sse2.S
  61. +11
    -11
      kernel/x86/trsm_kernel_LN_4x4_sse.S
  62. +5
    -5
      kernel/x86/trsm_kernel_LT_2x4_sse2.S
  63. +11
    -11
      kernel/x86/trsm_kernel_LT_4x4_sse.S
  64. +5
    -5
      kernel/x86/trsm_kernel_RT_2x4_sse2.S
  65. +11
    -11
      kernel/x86/trsm_kernel_RT_4x4_sse.S
  66. +56
    -12
      kernel/x86/zgemv_n_sse.S
  67. +55
    -11
      kernel/x86/zgemv_n_sse2.S
  68. +58
    -13
      kernel/x86/zgemv_t_sse.S
  69. +58
    -14
      kernel/x86/zgemv_t_sse2.S
  70. +2
    -2
      kernel/x86/ztrsm_kernel_LN_2x2_sse.S
  71. +2
    -2
      kernel/x86/ztrsm_kernel_LT_2x2_sse.S
  72. +2
    -2
      kernel/x86/ztrsm_kernel_RT_2x2_sse.S
  73. +45
    -36
      kernel/x86_64/KERNEL.BULLDOZER
  74. +70
    -0
      kernel/x86_64/KERNEL.PILEDRIVER
  75. +5
    -1
      kernel/x86_64/axpy_sse.S
  76. +4
    -1
      kernel/x86_64/axpy_sse2.S
  77. +1900
    -0
      kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
  78. +53
    -4
      kernel/x86_64/cgemv_n.S
  79. +46
    -2
      kernel/x86_64/cgemv_t.S
  80. +408
    -0
      kernel/x86_64/daxpy_bulldozer.S
  81. +291
    -0
      kernel/x86_64/dcopy_bulldozer.S
  82. +311
    -0
      kernel/x86_64/ddot_bulldozer.S
  83. +0
    -1860
      kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
  84. +3880
    -0
      kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
  85. +1821
    -0
      kernel/x86_64/dgemm_ncopy_8_bulldozer.S
  86. +667
    -0
      kernel/x86_64/dgemm_tcopy_8_bulldozer.S
  87. +49
    -7
      kernel/x86_64/dgemv_n.S
  88. +2325
    -0
      kernel/x86_64/dgemv_n_bulldozer.S
  89. +1938
    -0
      kernel/x86_64/dgemv_t_bulldozer.S
  90. +360
    -0
      kernel/x86_64/gemm_ncopy_2_bulldozer.S
  91. +374
    -0
      kernel/x86_64/gemm_tcopy_2_bulldozer.S
  92. +4657
    -0
      kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
  93. +48
    -8
      kernel/x86_64/sgemv_n.S
  94. +5
    -5
      kernel/x86_64/sgemv_t.S
  95. +1
    -1
      kernel/x86_64/symv_L_sse.S
  96. +1
    -1
      kernel/x86_64/symv_L_sse2.S
  97. +1
    -1
      kernel/x86_64/symv_U_sse.S
  98. +1
    -1
      kernel/x86_64/symv_U_sse2.S
  99. +1407
    -0
      kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
  100. +3
    -3
      kernel/x86_64/zgemm_kernel_4x4_sandy.S

+ 4
- 0
.gitignore View File

@@ -4,12 +4,16 @@
*.dylib
*.def
*.o
*.out
lapack-3.1.1
lapack-3.1.1.tgz
lapack-3.4.1
lapack-3.4.1.tgz
lapack-3.4.2
lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
*.so
*.a
.svn


+ 24
- 0
.travis.yml View File

@@ -0,0 +1,24 @@
language: c
compiler:
- gcc

env:
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"

before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi

script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE

# whitelist
branches:
only:
- master
- develop

+ 87
- 0
CONTRIBUTORS.md View File

@@ -0,0 +1,87 @@
# Contributions to the OpenBLAS project

## Creator & Maintainer

* Zhang Xianyi <traits.zhang@gmail.com>

## Active Developers

* Wang Qian <traz0824@gmail.com>
* Optimize BLAS3 on ICT Loongson 3A.
* Optimize BLAS3 on Intel Sandy Bridge.

* Zaheer Chothia <zaheer.chothia@gmail.com>
* Improve the compatibility about complex number
* Build LAPACKE: C interface to LAPACK
* Improve the windows build.

## Previous Developers

* Chen Shaohu <huhumartinwar@gmail.com>
* Optimize GEMV on the Loongson 3A processor.

* Luo Wen
* Intern. Test Level-2 BLAS.

## Contributors

In chronological order:

* pipping <http://page.mi.fu-berlin.de/pipping>
* [2011-06-11] Make USE_OPENMP=0 disable openmp.

* Stefan Karpinski <stefan@karpinski.org>
* [2011-12-28] Fix a bug about SystemStubs on Mac OS X.

* Alexander Eberspächer <https://github.com/aeberspaecher>
* [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32.

* Mike Nolta <mike@nolta.net>
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.

* Sylvestre Ledru <https://github.com/sylvestre>
* [2012-07-01] Improve the detection of sparc. Fix building bug under
Hurd and kfreebsd.

* Jameson Nash <https://github.com/vtjnash>
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
make on the command line.

* Alexander Nasonov <alnsn@yandex.ru>
* [2012-11-10] Fix NetBSD build.

* Sébastien Villemot <sebastien@debian.org>
* [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.

* Werner Saar <wernsaar@googlemail.com>
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer

* Kang-Che Sung <Explorer09@gmail.com>
* [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.

* Kenneth Hoste <kenneth.hoste@gmail.com>
* [2013-05-22] Adjust Makefile about downloading LAPACK source files.

* Lei WANG <https://github.com/wlbksy>
* [2013-05-22] Fix a bug about wget.

* Dan Luu <http://www.linkedin.com/in/danluu>
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).

* grisuthedragon <https://github.com/grisuthedragon>
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
model is used by OpenBLAS.

* Sébastien Fabbro <bicatali@gentoo.org>
* [2013-07-24] Modify makefile to respect user's LDFLAGS
* [2013-07-24] Add stack markings for GNU as arch-independent for assembler files

* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

+ 50
- 0
Changelog.txt View File

@@ -1,4 +1,54 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.7
20-Jul-2013
common:
* Support LSB (Linux Standard Base) 4.1.
e.g. make CC=lsbcc
* Include LAPACK 3.4.2 source codes to the repo.
Avoid downloading at compile time.
* Add NO_PARALLEL_MAKE flag to disable parallel make.
* Create openblas_get_parallel to retrieve information which
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
* Change LIBSUFFIX from .lib to .a on windows.
* A walk round for dtrti_U single thread bug. Replace it with LAPACK codes. (#191)

x86/x86-64:
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
AMD Bulldozer. (Thank Werner Saar)
* Add Intel Haswell support (using Sandybridge optimizations).
(Thank Dan Luu)
* Add AMD Piledriver support (using Bulldozer optimizations).
* Fix the computational error in zgemm avx kernel on
Sandybridge. (#237)
* Fix the overflow bug in gemv.
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
is very large.(#214, #221, #246).
MIPS64:
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.

Power:
* Support Power7 by old Power6 kernels. (#220)

====================================================================
Version 0.2.6
2-Mar-2013
common:
* Improved OpenMP performance slightly. (d744c9)
* Improved cblas.h compatibility with Intel MKL.(#185)
* Fixed the overflowing bug in single thread cholesky factorization.
* Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174)

x86/x86-64:
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
We will tune the performance in future.
* Auto-detect Intel Xeon E7540.
* Fixed the overflowing buffer bug of gemv. (#173)
* Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189)

MIPS64:

====================================================================
Version 0.2.5
26-Nov-2012


+ 49
- 44
Makefile View File

@@ -82,27 +82,27 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), Linux)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
$(MAKE) -C exports dyn
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@$(MAKE) -C exports dyn
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
@$(MAKE) -C exports dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
@$(MAKE) -C exports dll
endif
endif

@@ -131,30 +131,33 @@ endif
ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
endif
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
#Save the config files for installation
cp Makefile.conf Makefile.conf_last
cp config.h config_last.h
@cp Makefile.conf Makefile.conf_last
@cp config.h config_last.h
ifdef QUAD_PRECISION
echo "#define QUAD_PRECISION">> config_last.h
@echo "#define QUAD_PRECISION">> config_last.h
endif
ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h
@echo "#define EXPRECISION">> config_last.h
endif
##
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
@$(MAKE) -C kernel commonlibs || exit 1
@for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
touch lib.grd
ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
endif
@touch lib.grd

prof : prof_blas prof_lapack

@@ -203,19 +206,19 @@ ifeq ($(NO_LAPACK), 1)
netlib :

else
netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
netlib : lapack_prebuild
ifndef NOFORTRAN
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
endif
ifndef NO_LAPACKE
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif
endif

prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof

$(NETLIB_LAPACK_DIR)/make.inc :
lapack_prebuild :
ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -224,11 +227,7 @@ ifndef NOFORTRAN
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
ifdef INTERFACE64
-@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -244,7 +243,7 @@ endif
lapack-3.4.2 : lapack-3.4.2.tgz
ifndef NOFORTRAN
ifndef NO_LAPACK
@if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
@if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
echo $(TAR) zxf $< ;\
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
@@ -262,27 +261,31 @@ lapack-3.4.2.tgz :
ifndef NOFORTRAN
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL)
curl -O $(LAPACK_URL);
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
fetch $(LAPACK_URL);
else
wget $(LAPACK_URL)
wget -O $@ $(LAPACK_URL);
endif
endif
endif

large.tgz :
ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/large.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif

timing.tgz :
ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/timing.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif

lapack-timing : lapack-3.4.2 large.tgz timing.tgz
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
@@ -314,10 +317,12 @@ clean ::
#endif
@$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
ifeq ($(OSNAME), Darwin)
@rm -rf getarch.dSYM getarch_2nd.dSYM
endif
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf $(NETLIB_LAPACK_DIR) ;\
fi
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f *.grd Makefile.conf_last config_last.h
@echo Done.
@echo Done.

+ 0
- 5
Makefile.generic View File

@@ -1,6 +1 @@
COPT = -Wall -O2 # -DGEMMTEST
ifdef BINARY64
else
# LDFLAGS = -m elf32ppc
LDFLAGS = -m elf_i386
endif

+ 18
- 13
Makefile.install View File

@@ -5,6 +5,7 @@ include ./Makefile.system

OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
OPENBLAS_BUILD_DIR:=$(CURDIR)

.PHONY : install
.NOTPARALLEL : install
@@ -48,32 +49,36 @@ endif
#for install static library
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
#for install shared library
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
@-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif

@echo Install OK!


+ 0
- 11
Makefile.power View File

@@ -17,13 +17,7 @@ endif
endif

ifdef BINARY64
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf64ppc
endif

ifeq ($(OSNAME), Darwin)
LDFLAGS = -arch ppc64
endif

ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -mpowerpc64 -maix64
@@ -34,17 +28,12 @@ ifeq ($(COMPILER_F77), xlf)
FCOMMON_OPT += -q64
endif
ARFLAGS = -X 64
LDFLAGS = -b64
ASFLAGS = -a64
endif
else
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf32ppc
endif
ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -Wa,-a32
ARFLAGS = -X 32
LDFLAGS = -b32
ASFLAGS = -a32
endif
endif


+ 6
- 3
Makefile.rule View File

@@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.2.5
VERSION = 0.2.7

# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -81,6 +81,9 @@ VERSION = 0.2.5
# and OS. However, the performance is low.
# NO_AVX = 1

# Don't use parallel make.
# NO_PARALLEL_MAKE = 1

# If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1

@@ -104,8 +107,8 @@ VERSION = 0.2.5

# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 50
# in small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4

# If you need santy check by comparing reference BLAS. It'll be very
# slow (Not implemented yet).


+ 0
- 1
Makefile.sparc View File

@@ -10,7 +10,6 @@ endif
ifeq ($(COMPILER_F77), f90)
FCOMMON_OPT += -xarch=v9
endif
LDFLAGS = -64
else

CCOMMON_OPT += -mcpu=v9


+ 121
- 11
Makefile.system View File

@@ -9,9 +9,7 @@ ifndef TOPDIR
TOPDIR = .
endif

ifndef NETLIB_LAPACK_DIR
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib

# Default C compiler
# - Only set if not specified on the command line or inherited from the environment.
@@ -20,6 +18,12 @@ endif
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
endif
endif

# Default Fortran compiler (FC) is selected by f_check.
@@ -53,7 +57,7 @@ GETARCH_FLAGS += -DUSE64BITINT
endif

ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=50
GEMM_MULTITHREAD_THRESHOLD=4
endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)

@@ -65,6 +69,19 @@ ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif

ifeq ($(QUIET_MAKE), 1)
MAKE += -s
endif

ifndef NO_PARALLEL_MAKE
NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)

ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif

# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
@@ -148,7 +165,12 @@ EXTRALIB += -defaultlib:advapi32

SUFFIX = obj
PSUFFIX = pobj
LIBSUFFIX = lib
LIBSUFFIX = a

ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -DMS_ABI
endif

ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
@@ -167,8 +189,15 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
CCOMMON_OPT += -DMS_ABI
endif
endif
endif

# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
ifeq ($(ARCH), x86)
CCOMMON_OPT += -mincoming-stack-boundary=2
FCOMMON_OPT += -mincoming-stack-boundary=2
endif

endif

ifeq ($(OSNAME), Interix)
@@ -223,11 +252,17 @@ NO_BINARY_MODE = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
endif
endif
@@ -235,11 +270,17 @@ endif
ifeq ($(ARCH), x86_64)
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
endif
endif
@@ -249,7 +290,13 @@ CCOMMON_OPT += -wd981
endif

ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
endif

ifeq ($(C_COMPILER), CLANG)
$(error OpenBLAS: Clang didn't support OpenMP yet.)
CCOMMON_OPT += -fopenmp
endif

@@ -277,14 +324,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif

ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif

@@ -318,11 +365,18 @@ endif
# C Compiler dependent settings
#

ifeq ($(C_COMPILER), GCC)

# ifeq logical or. GCC or CLANG or LSB
# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
CCOMMON_OPT += -Wall
COMMON_PROF += -fno-inline
NO_UNINITIALIZED_WARN = -Wno-uninitialized

ifeq ($(QUIET_MAKE), 1)
CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
endif

ifdef NO_BINARY_MODE

ifeq ($(ARCH), mips64)
@@ -407,7 +461,12 @@ endif
ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
ifneq ($(C_COMPILER), LSB)
EXTRALIB += -lgfortran
endif
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifdef BINARY64
@@ -514,11 +573,28 @@ ifdef INTERFACE64
FCOMMON_OPT += -i8
endif
endif

ifeq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3 -static
endif

ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3 -static
endif

else
ifndef BINARY64
FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
endif

ifdef USE_OPENMP
FEXTRALIB += -lstdc++
@@ -527,12 +603,30 @@ endif
endif

ifeq ($(C_COMPILER), OPEN64)

ifeq ($(ARCH), mips64)
ifndef BINARY64
CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3 -static
endif

ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3 -static
endif

else

ifndef BINARY64
CCOMMON_OPT += -m32
else
CCOMMON_OPT += -m64
endif
endif
endif

ifeq ($(C_COMPILER), SUN)
CCOMMON_OPT += -w
@@ -741,6 +835,15 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =

LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
ifeq ($(C_COMPILER), LSB)
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
endif

ifndef SUFFIX
SUFFIX = o
endif
@@ -835,6 +938,13 @@ export ZGEMM_UNROLL_M
export ZGEMM_UNROLL_N
export XGEMM_UNROLL_M
export XGEMM_UNROLL_N
export CGEMM3M_UNROLL_M
export CGEMM3M_UNROLL_N
export ZGEMM3M_UNROLL_M
export ZGEMM3M_UNROLL_N
export XGEMM3M_UNROLL_M
export XGEMM3M_UNROLL_N


ifdef USE_CUDA
export CUDADIR


+ 0
- 3
Makefile.x86 View File

@@ -1,8 +1,5 @@
# COMPILER_PREFIX = mingw32-

ifeq ($(OSNAME), Linux)
LDFLAGS = -melf_i386
endif

ifeq ($(OSNAME), Interix)
ARFLAGS = -m x86


+ 0
- 13
Makefile.x86_64 View File

@@ -2,25 +2,12 @@

ifeq ($(OSNAME), SunOS)
ifdef BINARY64
LDFLAGS = -64
ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -m64
endif
endif
endif

ifeq ($(OSNAME), FreeBSD)
LDFLAGS = -m elf_x86_64_fbsd
endif

ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf_x86_64
endif

ifeq ($(OSNAME), Darwin)
LDFLAGS =
endif

ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif


+ 29
- 14
README.md View File

@@ -1,11 +1,20 @@
# OpenBLAS

[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)

## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.

Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.

## Installation
## Binary Packages
We provide binary packages for the following platform.

* Windows x86/x86_64

You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).

## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/

Or, check out codes from git://github.com/xianyi/OpenBLAS.git
@@ -23,11 +32,15 @@ On X86 box, compile this library for loongson3a CPU.

make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A

On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.

make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32

### Debug version

make DEBUG=1

### Intall to the directory (Optional)
### Install to the directory (optional)

Example:

@@ -43,8 +56,10 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
- **AMD PILEDRIVER**: Used Bulldozer codes.

#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
@@ -54,7 +69,7 @@ Please read GotoBLAS_01Readme.txt
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supportted by community. We didn't test the library on this OS.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.

## Usages
Link with libopenblas.a or -lopenblas for shared library.
@@ -79,7 +94,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro

### Set the number of threads on runtime.

We provided the below functions to controll the number of threads on runtime.
We provided the below functions to control the number of threads on runtime.

void goto_set_num_threads(int num_threads);

@@ -91,7 +106,8 @@ If you compile this lib with USE_OPENMP=1, you should use the above functions, t
Please add a issue in https://github.com/xianyi/OpenBLAS/issues

## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev

## ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
@@ -104,10 +120,9 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.

## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.


+ 1
- 1
TargetList.txt View File

@@ -8,8 +8,8 @@ Supported List:
1.X86/X86_64
a)Intel CPU:
P2
COPPERMINE
KATMAI
COPPERMINE
NORTHWOOD
PRESCOTT
BANIAS


+ 12
- 6
c_check View File

@@ -33,6 +33,8 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
}

$compiler = "";
$compiler = LSB if ($data =~ /COMPILER_LSB/);
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
$compiler = PGI if ($data =~ /COMPILER_PGI/);
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
@@ -117,7 +119,11 @@ if ($compiler eq "OPEN64") {
$openmp = "-mp";
}

if ($compiler eq "GCC") {
if ($compiler eq "CLANG") {
$openmp = "-fopenmp";
}

if ($compiler eq "GCC" || $compiler eq "LSB") {
$openmp = "-fopenmp";
}

@@ -241,13 +247,13 @@ print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";

if ($os eq "LINUX") {
@pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
if ($pthread[2] ne "") {
print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
} else {
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}
# }
} else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}


+ 10
- 0
cblas.h View File

@@ -16,6 +16,16 @@ void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);

/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2


#define CBLAS_INDEX size_t

typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;


+ 17
- 0
common.h View File

@@ -314,6 +314,23 @@ typedef int blasint;
#define YIELDING sched_yield()
#endif

/***
To alloc job_t on heap or statck.
please https://github.com/xianyi/OpenBLAS/issues/246
***/
#if defined(OS_WINDOWS)
#define GETRF_MEM_ALLOC_THRESHOLD 32
#define BLAS3_MEM_ALLOC_THRESHOLD 32
#endif

#ifndef GETRF_MEM_ALLOC_THRESHOLD
#define GETRF_MEM_ALLOC_THRESHOLD 80
#endif

#ifndef BLAS3_MEM_ALLOC_THRESHOLD
#define BLAS3_MEM_ALLOC_THRESHOLD 160
#endif

#ifdef QUAD_PRECISION
#include "common_quad.h"
#endif


+ 9
- 1
common_alpha.h View File

@@ -150,9 +150,17 @@ REALNAME:
#define PROFCODE .prologue 0
#endif

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \
.end REALNAME; \
.ident VERSION
.ident VERSION; \
GNUSTACK

#endif

#ifdef DOUBLE


+ 8
- 1
common_ia64.h View File

@@ -379,8 +379,15 @@ REALNAME:
#define PROFCODE
#endif

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \
.endp REALNAME
.endp REALNAME ; \
GNUSTACK

#define START_ADDRESS 0x20000fc800000000UL



+ 14
- 1
common_linux.h View File

@@ -65,9 +65,16 @@ extern long int syscall (long int __sysno, ...);
#endif
#endif



static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode,
unsigned flags) {
#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
@@ -79,11 +86,17 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
// unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif
#endif
}

static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
return syscall(SYS_set_mempolicy, mode, addr, flag);
#endif
}

static inline int my_gettid(void) {


+ 10
- 3
common_mips64.h View File

@@ -235,10 +235,17 @@ REALNAME: ;\
.set noreorder ;\
.set nomacro

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \
.set macro ;\
.set reorder ;\
.end REALNAME
.end REALNAME ;\
GNUSTACK

#define PROFCODE
#endif
@@ -255,8 +262,8 @@ REALNAME: ;\
#endif

#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif

#ifndef PAGESIZE


+ 10
- 1
common_sparc.h View File

@@ -199,8 +199,17 @@ static __inline int blas_quickdivide(blasint x, blasint y){
.type REALNAME, #function; \
.proc 07; \
REALNAME:;

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif

#define EPILOGUE \
.size REALNAME, .-REALNAME
.size REALNAME, .-REALNAME; \
GNUSTACK

#endif

#endif


+ 10
- 3
common_x86.h View File

@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd
#endif

#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
@@ -296,7 +301,9 @@ REALNAME:
#define PROFCODE
#endif

#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits

#endif

@@ -335,6 +342,7 @@ REALNAME:
#define ALIGN_2 .align 2
#define ALIGN_3 .align 3
#define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp
#endif

@@ -356,11 +364,10 @@ REALNAME:

#ifndef ALIGN_6
#define ALIGN_6 .align 64
#endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif

+ 9
- 1
common_x86_64.h View File

@@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){

#ifdef ASSEMBLER

#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
@@ -367,7 +372,10 @@ REALNAME:
#define PROFCODE
#endif

#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits


#endif



+ 7
- 0
cpuid.h View File

@@ -106,6 +106,8 @@
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE

#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -127,6 +129,7 @@
#define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)

#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@@ -196,4 +199,8 @@ typedef struct {
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE

#endif

+ 1
- 0
cpuid_power.c View File

@@ -114,6 +114,7 @@ int detect(void){
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;



+ 68
- 11
cpuid_x86.c View File

@@ -41,10 +41,14 @@
#include "cpuid.h"

#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif

#ifndef CPUIDEMU
@@ -130,7 +134,7 @@ int support_avx(){
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
@@ -225,6 +229,7 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif

if (have_excpuid() >= 0x01) {
@@ -1050,8 +1055,22 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 12:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
}
break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@@ -1084,11 +1103,21 @@ int get_cpuname(void){
case 1:
case 10:
return CPUTYPE_BARCELONA;
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 6:
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
case 5:
return CPUTYPE_BOBCAT;
}
@@ -1213,6 +1242,7 @@ static char *cpuname[] = {
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
};

static char *lowercpuname[] = {
@@ -1262,6 +1292,7 @@ static char *lowercpuname[] = {
"sandybridge",
"bobcat",
"bulldozer",
"piledriver",
};

static char *corename[] = {
@@ -1288,6 +1319,7 @@ static char *corename[] = {
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
};

static char *corename_lower[] = {
@@ -1314,6 +1346,7 @@ static char *corename_lower[] = {
"sandybridge",
"bobcat",
"bulldozer",
"piledriver",
};


@@ -1424,8 +1457,22 @@ int get_coretype(void){
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 12:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
}
break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
}
break;
}
break;

@@ -1442,11 +1489,19 @@ int get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
}
}else return CORE_BARCELONA;
}
}
@@ -1534,6 +1589,7 @@ void get_cpuconfig(void){
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@@ -1601,5 +1657,6 @@ void get_sse(void){
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");

}

+ 14
- 0
ctest.c View File

@@ -1,3 +1,17 @@
//LSB (Linux Standard Base) compiler
//only support lsbc++
#if defined (__LSB_VERSION__)
#if !defined (__cplusplus)
COMPILER_LSB
#else
#error "OpenBLAS only supports lsbcc."
#endif
#endif

#if defined(__clang__)
COMPILER_CLANG
#endif

#if defined(__PGI) || defined(__PGIC__)
COMPILER_PGI
#endif


+ 1
- 1
ctest/Makefile View File

@@ -77,7 +77,7 @@ endif
clean ::
rm -f x*

FLDFLAGS = $(FFLAGS:-fPIC=)
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =

# Single real


+ 6
- 4
driver/level2/sbmv_thread.c View File

@@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F

a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b;
y = (FLOAT *)args -> c;

lda = args -> lda;
incx = args -> ldb;
@@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_from = 0;
n_to = n;

//Use y as each thread's n* COMPSIZE elements in sb buffer
y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);

if (range_m) {
n_from = *(range_m + 0);
n_to = *(range_m + 1);
@@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE;
}

if (range_n) y += *range_n * COMPSIZE;

if (incx != 1) {
COPY_K(n, x, incx, buffer, 1);
@@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x

if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
@@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#else
ONE, ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
}

AXPYU_K(n, 0, 0,


+ 2
- 2
driver/level3/gemm_thread_n.c View File

@@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu];
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else
@@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
}
if (num_cpu) {
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[0].sa = sa;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else


+ 14
- 1
driver/level3/level3.c View File

@@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC();


+ 24
- 1
driver/level3/level3_gemm3m_thread.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM3M_LOCAL
#if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN
@@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];

job_t job[MAX_CPU_NUMBER];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

BLASLONG num_cpu_m, num_cpu_n;

@@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;
if (!range_m) {
@@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}



+ 23
- 0
driver/level3/level3_syrk_threaded.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
@@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER];

BLASLONG range[MAX_CPU_NUMBER + 100];
@@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;
if (!range_n) {
@@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}

+ 38
- 2
driver/level3/level3_thread.c View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
@@ -360,8 +366,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
@@ -519,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG

blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER];

BLASLONG range_M[MAX_CPU_NUMBER + 1];
@@ -563,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;
#ifdef PARAMTEST
@@ -634,7 +666,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_n ++;
}
for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
@@ -648,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}



+ 4
- 1
driver/others/Makefile View File

@@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system

COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX)

COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)

@@ -106,6 +106,9 @@ openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F)

openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
$(CC) $(CFLAGS) -c $< -o $(@F)

blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)



+ 1
- 0
driver/others/blas_server.c View File

@@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
#ifdef MONITOR


+ 50
- 6
driver/others/blas_server_omp.c View File

@@ -49,8 +49,12 @@

int blas_server_avail = 0;

static void * blas_thread_buffer[MAX_CPU_NUMBER];

void goto_set_num_threads(int num_threads) {

int i=0;

if (num_threads < 1) num_threads = blas_num_threads;

if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;

omp_set_num_threads(blas_cpu_number);

//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
@@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {

int blas_thread_init(void){

int i=0;

blas_get_cpu_number();

blas_server_avail = 1;

for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
}

return 0;
}

int BLASFUNC(blas_thread_shutdown)(void){

int i=0;
blas_server_avail = 0;

for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}

return 0;
}

@@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static void exec_threads(blas_queue_t *queue){

void *buffer, *sa, *sb;

int pos=0, release_flag=0;
buffer = NULL;
sa = queue -> sa;
sb = queue -> sb;
@@ -189,9 +222,19 @@ static void exec_threads(blas_queue_t *queue){

if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {

buffer = blas_memory_alloc(2);
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];

if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
//fallback
if(buffer==NULL) {
buffer = blas_memory_alloc(2);
release_flag=1;
}

if (sa == NULL) {
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
queue->sa=sa;
}
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
@@ -224,6 +267,7 @@ static void exec_threads(blas_queue_t *queue){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
}

@@ -241,7 +285,7 @@ static void exec_threads(blas_queue_t *queue){

}

if (buffer != NULL) blas_memory_free(buffer);
if (release_flag) blas_memory_free(buffer);

}



+ 2
- 1
driver/others/blas_server_win32.c View File

@@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
#ifdef MONITOR
@@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads)
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}
}

+ 42
- 7
driver/others/dynamic.c View File

@@ -64,12 +64,15 @@ extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif

//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE

#define VENDOR_INTEL 1
#define VENDOR_AMD 2
@@ -92,7 +95,7 @@ int support_avx(){
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
@@ -175,7 +178,7 @@ static gotoblas_t *get_coretype(void){
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
@@ -186,7 +189,27 @@ static gotoblas_t *get_coretype(void){
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Haswell
if (model == 12) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 4:
//Intel Haswell
if (model == 5) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
@@ -207,13 +230,23 @@ static gotoblas_t *get_coretype(void){
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(model == 1){
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n");
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}else if(model == 2){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else {
return &gotoblas_BARCELONA;
}
@@ -251,6 +284,7 @@ static char *corename[] = {
"Sandybridge",
"Bobcat",
"Bulldozer",
"Piledriver",
};

char *gotoblas_corename(void) {
@@ -273,6 +307,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];

return corename[0];
}


+ 5
- 2
driver/others/init.c View File

@@ -82,6 +82,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sched.h>
#include <dirent.h>
#include <dlfcn.h>
#include <unistd.h>
#include <string.h>

#define MAX_NODES 16
#define MAX_CPUS 256
@@ -314,7 +316,7 @@ static int numa_check(void) {
}

while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
if (strncmp(dir->d_name, "node", 4)==0) {

node = atoi(&dir -> d_name[4]);

@@ -735,7 +737,8 @@ void gotoblas_affinity_init(void) {
fprintf(stderr, "Shared Memory Initialization.\n");
#endif

common -> num_procs = get_nprocs();
//returns the number of processors which are currently online
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;

if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);


+ 22
- 1
driver/others/memory.c View File

@@ -105,6 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif

#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
@@ -125,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NO_WARMUP
#endif

#ifdef ALLOC_HUGETLB
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

@@ -216,6 +217,25 @@ int get_num_procs(void) {
}
return nums;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;

StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif

/*
@@ -1248,6 +1268,7 @@ void CONSTRUCTOR gotoblas_init(void) {

if (gotoblas_initialized) return;


#ifdef PROFILE
moncontrol (0);
#endif


+ 52
- 0
driver/others/openblas_get_parallel.c View File

@@ -0,0 +1,52 @@
/*****************************************************************************
Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "common.h"

#if defined(USE_OPENMP)
static int parallel = 2 ;
#elif defined(SMP_SERVER)
static int parallel = 1;
#else
static int parallel = 0;
#endif

int CNAME() {
return parallel;
}

int NAME() {
return parallel;
}




+ 14
- 9
exports/Makefile View File

@@ -89,7 +89,7 @@ else
endif

libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
$(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \
$(CC) $(CFLAGS) $(LDFLAGS) libgoto2_shared.def -shared -o $(@F) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)

@@ -116,10 +116,15 @@ ifeq ($(OSNAME), Linux)
so : ../$(LIBSONAME)

../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
#Use FC on LSB
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
rm -f linktest

endif
@@ -130,10 +135,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
so : ../$(LIBSONAME)

../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest

endif
@@ -143,15 +148,15 @@ ifeq ($(OSNAME), OSF1)
so : ../$(LIBSONAME)

../$(LIBSONAME) :
$(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
endif

ifeq ($(OSNAME), SunOS)

so : ../$(LIBSONAME)
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest

endif
@@ -194,7 +199,7 @@ symbol.S : gensymbol
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S

test : linktest.c
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest

linktest.c : gensymbol ../Makefile.system ../getarch.c


+ 56
- 21
exports/gensymbol View File

@@ -49,7 +49,7 @@
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv);
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub );

@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -72,13 +72,18 @@
zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
);


#both underscore and no underscore
@misc_common_objs = (
openblas_set_num_threads, openblas_get_parallel,
);

@misc_no_underscore_objs = (
openblas_set_num_threads, goto_set_num_threads,
goto_set_num_threads,
openblas_get_config,
);

@misc_underscore_objs = (
openblas_set_num_threads,
);

@lapackobjs = (
@@ -111,7 +116,7 @@
# already provided by @blasobjs: xerbla, lsame
ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
ilaver, slamch,
ilaver, slamch, slamc3,

# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
# excluded: second_$(TIMER)
@@ -148,7 +153,7 @@
dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
dsteqr, dsterf, dlaisnan, disnan,
dlartgp, dlartgs,
dlamch,
dlamch, dlamc3,

# SLASRC -- Single precision real LAPACK routines
# already provided by @lapackobjs:
@@ -2671,7 +2676,7 @@ if ($ARGV[5] == 1) {
#NO_LAPACK=1
@underscore_objs = (@blasobjs, @misc_underscore_objs);
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" ||
-d "../lapack-3.4.2") {
-d "../lapack-3.4.2" || -d "../lapack-netlib") {
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
} else {
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
@@ -2679,7 +2684,7 @@ if ($ARGV[5] == 1) {

if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };

if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
if ($ARGV[1] eq "x86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };

if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };

@@ -2716,6 +2721,10 @@ $bu = $ARGV[2];
$bu = "" if (($bu eq "0") || ($bu eq "1"));

if ($ARGV[0] eq "linux"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) {
print $objs, $bu, "\n";
}
@@ -2733,6 +2742,10 @@ if ($ARGV[0] eq "linux"){
}

if ($ARGV[0] eq "osx"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) {
print "_", $objs, $bu, "\n";
}
@@ -2746,6 +2759,10 @@ if ($ARGV[0] eq "osx"){
}

if ($ARGV[0] eq "aix"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) {
print $objs, $bu, "\n";
}
@@ -2761,23 +2778,31 @@ if ($ARGV[0] eq "aix"){
if ($ARGV[0] eq "win2k"){
print "EXPORTS\n";
$count = 1;

@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

foreach $objs (@underscore_objs) {
unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
#for misc_common_objs
foreach $objs (@misc_common_objs) {

$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
#for openblas_set_num_threads
print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n";
$count ++;
foreach $objs (@no_underscore_objs) {
print "\t",$objs,"=$objs"," \@", $count, "\n";
@@ -2810,6 +2835,9 @@ if ($ARGV[0] eq "win2khpl"){
}

if ($ARGV[0] eq "microsoft"){

@underscore_objs = (@underscore_objs, @misc_common_objs);

print "EXPORTS\n";
$count = 1;
foreach $objs (@underscore_objs) {
@@ -2828,6 +2856,9 @@ if ($ARGV[0] eq "microsoft"){
}

if ($ARGV[0] eq "win2kasm"){

@underscore_objs = (@underscore_objs, @misc_common_objs);

print "\t.text\n";
foreach $objs (@underscore_objs) {
$uppercase = $objs;
@@ -2841,6 +2872,10 @@ if ($ARGV[0] eq "win2kasm"){
}

if ($ARGV[0] eq "linktest"){

@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);

print "int main(void){\n";
foreach $objs (@underscore_objs) {
print $objs, $bu, "();\n" if $objs ne "xerbla";


+ 1
- 1
f_check View File

@@ -24,7 +24,7 @@ $compiler = "" if $compiler eq "f77";

if ($compiler eq "") {

@lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",


+ 38
- 13
getarch.c View File

@@ -83,6 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef linux
#include <sys/sysinfo.h>
#include <unistd.h>
#endif

/* #define FORCE_P2 */
@@ -96,14 +97,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PENRYN */
/* #define FORCE_DUNNINGTON */
/* #define FORCE_NEHALEM */
/* #define FORCE_SANDYBRIDGE */
/* #define FORCE_ATOM */
/* #define FORCE_ATHLON */
/* #define FORCE_OPTERON */
/* #define FORCE_OPTERON_SSE3 */
/* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_BULLDOZER */
/* #define FORCE_BOBCAT */
/* #define FORCE_PILEDRIVER */
/* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */
/* #define FORCE_NANO */
@@ -118,12 +122,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */
/* #define FORCE_GENERIC */

#ifdef FORCE_P2
#define FORCE
@@ -139,32 +143,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "P5"
#endif

#ifdef FORCE_COPPERMINE
#ifdef FORCE_KATMAI
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#define LIBNAME "katmai"
#define CORENAME "KATMAI"
#endif

#ifdef FORCE_KATMAI
#ifdef FORCE_COPPERMINE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "katmai"
#define CORENAME "KATMAI"
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif

#ifdef FORCE_NORTHWOOD
@@ -396,6 +400,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BULLDOZER"
#endif

#if defined (FORCE_PILEDRIVER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PILEDRIVER"
#define ARCHCONFIG "-DPILEDRIVER " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME "piledriver"
#define CORENAME "PILEDRIVER"
#endif

#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
@@ -717,7 +737,8 @@ static int get_num_cores(void) {
#endif
#ifdef linux
return get_nprocs();
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(OS_WINDOWS)

@@ -802,8 +823,12 @@ int main(int argc, char *argv[]){
#endif
#endif

#if NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif

break;


+ 40
- 2
getarch_2nd.c View File

@@ -8,7 +8,7 @@

int main(int argc, char **argv) {

if ((argc < 1) || (*argv[1] == '0')) {
if ( (argc <= 1) || (argc >= 2) && (*argv[1] == '0')) {
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
@@ -22,10 +22,48 @@ int main(int argc, char **argv) {
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M);
printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N);

#ifdef CGEMM3M_DEFAULT_UNROLL_M
printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M);
#else
printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef CGEMM3M_DEFAULT_UNROLL_N
printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N);
#else
printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
#endif

#ifdef ZGEMM3M_DEFAULT_UNROLL_M
printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
#else
printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef ZGEMM3M_DEFAULT_UNROLL_N
printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
#else
printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
#endif

#ifdef XGEMM3M_DEFAULT_UNROLL_M
printf("XGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
#else
printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M);
#endif

#ifdef XGEMM3M_DEFAULT_UNROLL_N
printf("XGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
#else
printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
#endif


}

if ((argc >= 1) && (*argv[1] == '1')) {
if ((argc >= 2) && (*argv[1] == '1')) {
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));


+ 13
- 0
interface/trtri.c View File

@@ -60,6 +60,8 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
};
#endif

extern void dtrtri_lapack_(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);

int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){

blas_arg_t args;
@@ -83,6 +85,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
TOUPPER(uplo_arg);
TOUPPER(diag_arg);


uplo = -1;
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
@@ -90,6 +93,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1;


info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
if (args.n < 0) info = 3;
@@ -129,6 +133,15 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (args.nthreads == 1) {
#endif

#if DOUBLE
// double trtri_U single thread error
// call dtrtri from lapack for a walk around.
if(uplo==0){
dtrtri_lapack_(UPLO, DIAG, N, a, ldA, Info);
return 0;
}
#endif

*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP


+ 1
- 1
kernel/Makefile.L2 View File

@@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@

$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@

$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@


+ 216
- 216
kernel/Makefile.L3 View File

@@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@

$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
@@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@

$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@

$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@

$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@

$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@

$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c


+ 16
- 0
kernel/setparam-ref.c View File

@@ -826,6 +826,22 @@ static void init_parameter(void) {
#endif
#endif

#ifdef PILEDRIVER

#ifdef DEBUG
fprintf(stderr, "Piledriver\n");
#endif

TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif

#ifdef NANO

#ifdef DEBUG


+ 59
- 0
kernel/x86/KERNEL.PILEDRIVER View File

@@ -0,0 +1,59 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S

ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S

CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

+ 5
- 5
kernel/x86/gemv_n_sse.S View File

@@ -101,10 +101,10 @@
#define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -153,8 +153,8 @@

movl YY,J
movl J,Y
movl STACK_LDA, LDA

movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX

@@ -688,9 +688,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4



+ 3
- 3
kernel/x86/gemv_n_sse2.S View File

@@ -714,9 +714,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4



+ 9
- 19
kernel/x86/gemv_t_sse.S View File

@@ -102,11 +102,9 @@
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)

#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -129,12 +127,8 @@

PROFCODE

movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X
movl X,XX
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
@@ -144,7 +138,6 @@
addl $1,J
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
@@ -159,13 +152,10 @@
movl AA,%eax
movl %eax,A # mov AA to A

movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl XX,X
movl XX,%eax
movl %eax,X

movl STACK_LDA, LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY

@@ -688,9 +678,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4



+ 13
- 16
kernel/x86/gemv_t_sse2.S View File

@@ -76,7 +76,7 @@
#endif

#define STACKSIZE 16
#define ARGS 16
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
@@ -89,10 +89,9 @@
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)

#define I %eax
#define J %ebx
@@ -117,10 +116,8 @@
PROFCODE


movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA # backup A
movl M,J
@@ -130,7 +127,6 @@
addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
@@ -142,15 +138,13 @@
movl %eax,M

.L00t:
movl XX,%eax
movl %eax, X

movl AA,%eax
movl %eax,A # mov AA to A

movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_LDA, LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY

@@ -605,6 +599,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4



+ 2
- 2
kernel/x86/lsame.S View File

@@ -74,11 +74,11 @@
#else
movl %eax, %ecx
subl $32, %ecx
cmovg %ecx, %eax
cmovge %ecx, %eax

movl %edx, %ecx
subl $32, %ecx
cmovg %ecx, %edx
cmovge %ecx, %edx

subl %eax, %edx
movl $0, %eax


+ 5
- 5
kernel/x86/trsm_kernel_LN_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -439,7 +439,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@

.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LN_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@

.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4

.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif

@@ -2429,7 +2429,7 @@

.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@

.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@

.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@

.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3


+ 5
- 5
kernel/x86/trsm_kernel_LT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -910,7 +910,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@

.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_LT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@

.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4

.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif

@@ -2562,7 +2562,7 @@

.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@

.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@

.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@

.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0


+ 5
- 5
kernel/x86/trsm_kernel_RT_2x4_sse2.S View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -1036,7 +1036,7 @@

.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2


+ 11
- 11
kernel/x86/trsm_kernel_RT_4x4_sse.S View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@

.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@

.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@

.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@

.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4

.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif

@@ -2031,7 +2031,7 @@

.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2


+ 56
- 12
kernel/x86/zgemv_n_sse.S View File

@@ -89,18 +89,23 @@
#endif

#define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@

PROLOGUE

subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,6 +136,33 @@

PROFCODE

movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y

movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -595,10 +628,21 @@
ALIGN_3

.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret

EPILOGUE

+ 55
- 11
kernel/x86/zgemv_n_sse2.S View File

@@ -76,18 +76,23 @@
#endif

#define STACKSIZE 16
#define ARGS 16

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -110,6 +115,7 @@

PROLOGUE

subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,6 +123,33 @@

PROFCODE

movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl YY,J
movl J,Y

movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -458,10 +491,21 @@
ALIGN_3

.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret

EPILOGUE

+ 58
- 13
kernel/x86/zgemv_t_sse.S View File

@@ -89,18 +89,23 @@
#endif

#define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@

PROLOGUE

subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,8 +136,35 @@

PROFCODE

movl STACK_LDA, LDA
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA #backup A
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl $8,J
subl J,MMM #MMM-=J
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl XX,%eax
movl %eax,X

movl STACK_LDA,LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY

@@ -513,10 +546,22 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp

addl $ARGS,%esp
ret

EPILOGUE

+ 58
- 14
kernel/x86/zgemv_t_sse2.S View File

@@ -76,19 +76,24 @@
#endif

#define STACKSIZE 16
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)

#define MMM 0 + ARGS(%esp)
#define AA 4 + ARGS(%esp)
#define XX 8 + ARGS(%esp)

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx

@@ -110,6 +115,7 @@

PROLOGUE

subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,8 +123,35 @@

PROFCODE

movl STACK_X, X
movl X, XX
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl $4,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax, M

.L00t:
movl XX, %eax
movl %eax, X

movl AA,%eax
movl %eax,A

movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY

@@ -188,7 +221,7 @@
movl Y, Y1

movl N, J
ALIGN_3
ALIGN_4

.L11:
movl BUFFER, X
@@ -395,10 +428,21 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret

EPILOGUE

+ 2
- 2
kernel/x86/ztrsm_kernel_LN_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_LT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4


+ 2
- 2
kernel/x86/ztrsm_kernel_RT_2x2_sse.S View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4


+ 45
- 36
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -1,62 +1,71 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S

CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

+ 70
- 0
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -0,0 +1,70 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S

SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c



+ 5
- 1
kernel/x86_64/axpy_sse.S View File

@@ -69,7 +69,7 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA

movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +79,10 @@

SAVEREGISTERS

#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
shufps $0, ALPHA, ALPHA

leaq (, INCX, SIZE), INCX


+ 4
- 1
kernel/x86_64/axpy_sse2.S View File

@@ -69,7 +69,6 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA

movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +78,10 @@

SAVEREGISTERS

#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif

unpcklpd ALPHA, ALPHA

leaq (, INCX, SIZE), INCX


+ 1900
- 0
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 53
- 4
kernel/x86_64/cgemv_n.S View File

@@ -47,14 +47,22 @@

#ifndef WINDOWS_ABI

#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)

#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
#define ALPHAI 112(%rsp)

#define M %rdi
#define N %rsi
#define A %rcx
@@ -66,7 +74,7 @@

#else

#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +86,14 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)

#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define XX 256(%rsp)
#define LDAX 264(%rsp)
#define ALPHAR 272(%rsp)
#define ALPHAI 280(%rsp)

#define M %rcx
#define N %rdx
#define A %r8
@@ -142,9 +158,37 @@
movaps %xmm3, %xmm0
movss OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movq X, XX
movq OLD_Y, Y
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI

.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq XX, X

movq OLD_INCX, INCX
movq OLD_Y, Y
# movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER

@@ -4274,6 +4318,11 @@
ALIGN_3

.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12


+ 46
- 2
kernel/x86_64/cgemv_t.S View File

@@ -47,13 +47,19 @@

#ifndef WINDOWS_ABI

#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
#define M %rdi
#define N %rsi
@@ -66,7 +72,7 @@

#else

#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +84,13 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)

#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define ALPHAR 264(%rsp)
#define ALPHAI 272(%rsp)

#define M %rcx
#define N %rdx
#define A %r8
@@ -144,6 +157,32 @@
movss OLD_ALPHA_I, %xmm1
#endif

movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI

.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA

movq OLD_INCX, INCX
movq OLD_Y, Y
movq OLD_INCY, INCY
@@ -4350,6 +4389,11 @@
ALIGN_3

.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12


+ 408
- 0
kernel/x86_64/daxpy_bulldozer.S View File

@@ -0,0 +1,408 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M ARG1
#define X ARG4
#define INCX ARG5
#define Y ARG6
#define INCY ARG2
#else
#define M ARG1
#define X ARG2
#define INCX ARG3
#define Y ARG4
#define INCY %r10
#endif

#define YY %r11
#define ALPHA %xmm15

#define A_PRE 640

#include "l1param.h"
PROLOGUE
PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
movq 8(%rsp), INCY
#else
movq 24(%rsp), INCY
#endif
vmovups %xmm0, ALPHA
#else
vmovups %xmm3, ALPHA

movq 40(%rsp), X
movq 48(%rsp), INCX
movq 56(%rsp), Y
movq 64(%rsp), INCY
#endif

SAVEREGISTERS

unpcklpd ALPHA, ALPHA

leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

testq M, M
jle .L47
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40

testq $SIZE, Y
je .L10

movsd (X), %xmm0
mulsd ALPHA, %xmm0
addsd (Y), %xmm0
movsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4

.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y

movq M, %rax
sarq $4, %rax
jle .L13

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

decq %rax
jle .L12
ALIGN_3

.L11:

prefetchnta A_PRE(Y)

vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7


vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta A_PRE(X)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

prefetchnta A_PRE+64(Y)

vmovups 0 * SIZE(X), %xmm0
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vmovups 2 * SIZE(X), %xmm1
vmovups 4 * SIZE(X), %xmm2
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups 6 * SIZE(X), %xmm3


vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3

.L12:

vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7


vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7

vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3

.L13:


movq M, %rax
andq $8, %rax
jle .L14
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

.L14:
movq M, %rax
andq $4, %rax
jle .L15
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1

vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

.L15:
movq M, %rax
andq $2, %rax
jle .L16
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovups %xmm0, -16 * SIZE(Y)

addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

.L16:
movq M, %rax
andq $1, %rax
jle .L19
ALIGN_3

vmovsd -16 * SIZE(X), %xmm0
vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0

vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3

.L19:
xorq %rax,%rax

RESTOREREGISTERS

ret
ALIGN_3


.L40:
movq Y, YY
movq M, %rax
//If incx==0 || incy==0, avoid unloop.
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
sarq $3, %rax
jle .L45

prefetchnta 512(X)
prefetchnta 512+64(X)
prefetchnta 512+128(X)
prefetchnta 512+192(X)

prefetchnta 512(Y)
prefetchnta 512+64(Y)
prefetchnta 512+128(Y)
prefetchnta 512+192(Y)
ALIGN_3

.L41:

vmovsd 0 * SIZE(X), %xmm0
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm0 , %xmm0
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm6
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6
addq INCY, YY


vmovsd 0 * SIZE(X), %xmm1
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm1 , %xmm1
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm7
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7
addq INCY, YY

vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0

vmovsd 0 * SIZE(X), %xmm2
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm2 , %xmm2
addq INCX, X

vmovsd 0 * SIZE(YY), %xmm8
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8
addq INCY, YY

vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1

vmovsd 0 * SIZE(X), %xmm3
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm3 , %xmm3
addq INCX, X

vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2

vmovsd 0 * SIZE(YY), %xmm9
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9
addq INCY, YY


vmovsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm2, 0 * SIZE(Y)
addq INCY, Y

vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3

vmovsd %xmm3, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm3, 0 * SIZE(Y)
addq INCY, Y

decq %rax
jg .L41
ALIGN_3

.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3

.L46:
vmovsd (X), %xmm0
addq INCX, X

vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0

vmovsd %xmm0, (Y)
addq INCY, Y

decq %rax
jg .L46
ALIGN_3

.L47:
xorq %rax, %rax

RESTOREREGISTERS

ret

EPILOGUE

+ 291
- 0
kernel/x86_64/dcopy_bulldozer.S View File

@@ -0,0 +1,291 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
#define INCY %r10
#endif

#include "l1param.h"

#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
#define A_PRE 640
#define B_PRE 640

PROLOGUE
PROFCODE

#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif

SAVEREGISTERS

leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40

testq $SIZE, X
je .L10

vmovsd (X), %xmm0
vmovsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4

.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y


movq M, %rax
sarq $4, %rax
jle .L13

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups -8 * SIZE(X), %xmm4
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vmovups -2 * SIZE(X), %xmm7

decq %rax
jle .L12
ALIGN_4

.L11:

prefetchnta A_PRE(X)
nop
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta B_PRE(Y)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

VLOAD( 0 * SIZE, X, %xmm0)
VLOAD( 2 * SIZE, X, %xmm1)
VLOAD( 4 * SIZE, X, %xmm2)
VLOAD( 6 * SIZE, X, %xmm3)

prefetchnta A_PRE+64(X)
nop
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta B_PRE+64(Y)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

VLOAD( 8 * SIZE, X, %xmm4)
VLOAD(10 * SIZE, X, %xmm5)
subq $-16 * SIZE, Y
VLOAD(12 * SIZE, X, %xmm6)
VLOAD(14 * SIZE, X, %xmm7)

subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3

.L12:
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)

subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3

.L13:
testq $8, M
jle .L14
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

.L14:
testq $4, M
jle .L15
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1

vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

.L15:
testq $2, M
jle .L16
ALIGN_3

vmovups -16 * SIZE(X), %xmm0
vmovups %xmm0, -16 * SIZE(Y)

addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

.L16:
testq $1, M
jle .L19
ALIGN_3

vmovsd -16 * SIZE(X), %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3

.L19:
xorq %rax,%rax

RESTOREREGISTERS

ret
ALIGN_3



.L40:
movq M, %rax
sarq $3, %rax
jle .L45
ALIGN_3

.L41:
vmovsd (X), %xmm0
addq INCX, X
vmovsd (X), %xmm4
addq INCX, X
vmovsd (X), %xmm1
addq INCX, X
vmovsd (X), %xmm5
addq INCX, X
vmovsd (X), %xmm2
addq INCX, X
vmovsd (X), %xmm6
addq INCX, X
vmovsd (X), %xmm3
addq INCX, X
vmovsd (X), %xmm7
addq INCX, X

vmovsd %xmm0, (Y)
addq INCY, Y
vmovsd %xmm4, (Y)
addq INCY, Y
vmovsd %xmm1, (Y)
addq INCY, Y
vmovsd %xmm5, (Y)
addq INCY, Y
vmovsd %xmm2, (Y)
addq INCY, Y
vmovsd %xmm6, (Y)
addq INCY, Y
vmovsd %xmm3, (Y)
addq INCY, Y
vmovsd %xmm7, (Y)
addq INCY, Y

decq %rax
jg .L41
ALIGN_3

.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3

.L46:
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3

.L47:
xorq %rax, %rax

RESTOREREGISTERS

ret

EPILOGUE

+ 311
- 0
kernel/x86_64/ddot_bulldozer.S View File

@@ -0,0 +1,311 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
#define INCY %r10
#endif

#define A_PRE 512

#include "l1param.h"

PROLOGUE
PROFCODE

#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif

SAVEREGISTERS

leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY

vxorps %xmm0, %xmm0 , %xmm0
vxorps %xmm1, %xmm1 , %xmm1
vxorps %xmm2, %xmm2 , %xmm2
vxorps %xmm3, %xmm3 , %xmm3

cmpq $0, N
jle .L999

cmpq $SIZE, INCX
jne .L50
cmpq $SIZE, INCY
jne .L50

subq $-16 * SIZE, X
subq $-16 * SIZE, Y

testq $SIZE, Y
je .L10

vmovsd -16 * SIZE(X), %xmm0
vmulsd -16 * SIZE(Y), %xmm0 , %xmm0
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq N
ALIGN_2

.L10:

movq N, %rax
sarq $4, %rax
jle .L14

vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7

vmovups -8 * SIZE(X), %xmm8
vmovups -6 * SIZE(X), %xmm9
vmovups -4 * SIZE(X), %xmm10
vmovups -2 * SIZE(X), %xmm11

decq %rax
jle .L12

ALIGN_3

.L11:
prefetchnta A_PRE(Y)

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
prefetchnta A_PRE(X)
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

vmovups 0 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vmovups 2 * SIZE(X), %xmm5
vmovups 4 * SIZE(X), %xmm6
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
vmovups 6 * SIZE(X), %xmm7

prefetchnta A_PRE+64(Y)

vmovups 8 * SIZE(X), %xmm8
vmovups 10 * SIZE(X), %xmm9
prefetchnta A_PRE+64(X)
vmovups 12 * SIZE(X), %xmm10
vmovups 14 * SIZE(X), %xmm11

subq $-16 * SIZE, X
subq $-16 * SIZE, Y

decq %rax
jg .L11
ALIGN_3

.L12:

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3

subq $-16 * SIZE, X
subq $-16 * SIZE, Y
ALIGN_3

.L14:
testq $15, N
jle .L999

testq $8, N
jle .L15

vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3

addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3

.L15:
testq $4, N
jle .L16

vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5

vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1

addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3

.L16:
testq $2, N
jle .L17

vmovups -16 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0


addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3

.L17:
testq $1, N
jle .L999

vmovsd -16 * SIZE(X), %xmm4
vmovsd -16 * SIZE(Y), %xmm5
vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0
jmp .L999
ALIGN_3


.L50:
movq N, %rax
sarq $3, %rax
jle .L55
ALIGN_3

.L53:


vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y

vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3


vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y

vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3

decq %rax
jg .L53
ALIGN_3

.L55:
movq N, %rax
andq $7, %rax
jle .L999
ALIGN_3

.L56:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y

vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0

decq %rax
jg .L56
ALIGN_3

.L999:
vaddpd %xmm1, %xmm0 , %xmm0
vaddpd %xmm3, %xmm2 , %xmm2
vaddpd %xmm2, %xmm0 , %xmm0

vhaddpd %xmm0, %xmm0 , %xmm0

RESTOREREGISTERS

ret

EPILOGUE

+ 0
- 1860
kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
File diff suppressed because it is too large
View File


+ 3880
- 0
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 1821
- 0
kernel/x86_64/dgemm_ncopy_8_bulldozer.S
File diff suppressed because it is too large
View File


+ 667
- 0
kernel/x86_64/dgemm_tcopy_8_bulldozer.S View File

@@ -0,0 +1,667 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS

#define A_PRE 256

#ifndef WINDOWS_ABI

#define N ARG1 /* rsi */
#define M ARG2 /* rdi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

#define AO1 %r9
#define AO2 %r10
#define LDA3 %r11
#define M8 %r12

#else

#define N ARG1 /* rdx */
#define M ARG2 /* rcx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 56(%rsp)

#define B %r12

#define AO1 %rsi
#define AO2 %rdi
#define LDA3 %r10
#define M8 %r11
#endif

#define I %rax

#define B0 %rbp
#define B1 %r13
#define B2 %r14
#define B3 %r15

PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif

pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp

#ifdef WINDOWS_ABI
movq OLD_B, B
#endif

subq $-16 * SIZE, B

movq M, B1
movq M, B2
movq M, B3

andq $-8, B1
andq $-4, B2
andq $-2, B3

imulq N, B1
imulq N, B2
imulq N, B3

leaq (B, B1, SIZE), B1
leaq (B, B2, SIZE), B2
leaq (B, B3, SIZE), B3

leaq (,LDA, SIZE), LDA
leaq (LDA, LDA, 2), LDA3

leaq (, N, SIZE), M8

cmpq $8, N
jl .L20
ALIGN_4

.L11:
subq $8, N

movq A, AO1
leaq (A, LDA, 4), AO2
leaq (A, LDA, 8), A

movq B, B0
addq $64 * SIZE, B

movq M, I
sarq $3, I
jle .L14
ALIGN_4

.L13:

prefetchnta A_PRE(AO1)
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)


vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)


prefetchnta A_PRE(AO1, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)

prefetchnta A_PRE(AO2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, 16 * SIZE(B0)
vmovups %xmm1, 18 * SIZE(B0)
vmovups %xmm2, 20 * SIZE(B0)
vmovups %xmm3, 22 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 24 * SIZE(B0)
vmovups %xmm1, 26 * SIZE(B0)
vmovups %xmm2, 28 * SIZE(B0)
vmovups %xmm3, 30 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)

vmovups %xmm0, 32 * SIZE(B0)
vmovups %xmm1, 34 * SIZE(B0)
vmovups %xmm2, 36 * SIZE(B0)
vmovups %xmm3, 38 * SIZE(B0)

prefetchnta A_PRE(AO2, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, 40 * SIZE(B0)
vmovups %xmm1, 42 * SIZE(B0)
vmovups %xmm2, 44 * SIZE(B0)
vmovups %xmm3, 46 * SIZE(B0)

addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L13
ALIGN_4

.L14:
testq $4, M
jle .L16

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 0 * SIZE(B1)
vmovups %xmm1, 2 * SIZE(B1)
vmovups %xmm2, 4 * SIZE(B1)
vmovups %xmm3, 6 * SIZE(B1)

VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B1)
vmovups %xmm1, 10 * SIZE(B1)
vmovups %xmm2, 12 * SIZE(B1)
vmovups %xmm3, 14 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-32 * SIZE, B1
ALIGN_4

.L16:
testq $2, M
jle .L18

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B2)
vmovups %xmm1, -6 * SIZE(B2)
vmovups %xmm2, -4 * SIZE(B2)
vmovups %xmm3, -2 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-16 * SIZE, B2
ALIGN_4

.L18:
testq $1, M
jle .L19

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO1, LDA3), %xmm3

vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2

vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)

vmovsd 0 * SIZE(AO2), %xmm0
vmovsd 0 * SIZE(AO2, LDA), %xmm1
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO2, LDA3), %xmm3

vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2

vmovups %xmm0, -12 * SIZE(B3)
vmovups %xmm2, -10 * SIZE(B3)

subq $-8 * SIZE, B3
ALIGN_4

.L19:
cmpq $8, N
jge .L11
ALIGN_4

.L20:
cmpq $4, N
jl .L30

subq $4, N

movq A, AO1
leaq (A, LDA, 2), AO2
leaq (A, LDA, 4), A

movq B, B0
addq $32 * SIZE, B

movq M, I
sarq $3, I
jle .L24
ALIGN_4

.L23:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)


VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)

VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)

addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L23
ALIGN_4

.L24:
testq $4, M
jle .L26

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-16 * SIZE, B1
ALIGN_4

.L26:
testq $2, M
jle .L28

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-8 * SIZE, B2
ALIGN_4

.L28:
testq $1, M
jle .L30

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO2), %xmm2
vmovsd 0 * SIZE(AO2, LDA), %xmm3

vunpcklpd %xmm1, %xmm0, %xmm0
vunpcklpd %xmm3, %xmm2, %xmm2

vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
subq $-4 * SIZE, B3
ALIGN_4

.L30:
cmpq $2, N
jl .L40

subq $2, N

movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A

movq B, B0
addq $16 * SIZE, B

movq M, I
sarq $3, I
jle .L34
ALIGN_4

.L33:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)

VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)

vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)

addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0

decq I
jg .L33
ALIGN_4

.L34:
testq $4, M
jle .L36

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B1
ALIGN_4

.L36:
testq $2, M
jle .L38

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)

vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)

addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-4 * SIZE, B2
ALIGN_4

.L38:
testq $1, M
jle .L40

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1

vunpcklpd %xmm1, %xmm0, %xmm0

vmovups %xmm0, -16 * SIZE(B3)
subq $-2 * SIZE, B3
ALIGN_4

.L40:
cmpq $1, N
jl .L999

movq A, AO1

movq B, B0

movq M, I
sarq $3, I
jle .L44
ALIGN_4

.L43:

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)

vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)

addq $8 * SIZE, AO1
leaq (B0, M8, 8), B0

decq I
jg .L43
ALIGN_4

.L44:
testq $4, M
jle .L45

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)

vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)

addq $4 * SIZE, AO1
subq $-4 * SIZE, B1
ALIGN_4

.L45:
testq $2, M
jle .L46

VMOVUPS_A1(0 * SIZE, AO1, %xmm0)

vmovups %xmm0, -16 * SIZE(B2)

addq $2 * SIZE, AO1
subq $-2 * SIZE, B2
ALIGN_4

.L46:
testq $1, M
jle .L999

vmovsd 0 * SIZE(AO1), %xmm0

vmovsd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
.L999:
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15

#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif
ret

EPILOGUE

+ 49
- 7
kernel/x86_64/dgemv_n.S View File

@@ -47,7 +47,7 @@

#ifndef WINDOWS_ABI

#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
@@ -59,9 +59,14 @@
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)

#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 88(%rsp)
#else

#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,6 +79,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)

#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)

#endif

#define LDA %r8
@@ -137,17 +148,42 @@
movq OLD_LDA, LDA
#endif

movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

#ifndef WINDOWS_ABI
movsd %xmm0, ALPHA
#else
movsd %xmm3, ALPHA
#endif

movq STACK_Y, Y
movq A,AA
movq N,NN
movq M,MMM
movq LDA,LDAX
movq X,XX

.L0t:
xorq I,I
addq $1,I
salq $21,I
subq I,MMM
movq I,M
jge .L00t

movq MMM,M
addq I,M
jle .L999x
.L00t:
movq XX,X
movq AA,A
movq NN,N
movq LDAX,LDA

movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER


leaq -1(INCY), %rax

leaq (,INCX, SIZE), INCX
@@ -2815,6 +2851,12 @@
ALIGN_3

.L999:
leaq (, M, SIZE), %rax
addq %rax,AA
jmp .L0t
ALIGN_4

.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12


+ 2325
- 0
kernel/x86_64/dgemv_n_bulldozer.S
File diff suppressed because it is too large
View File


+ 1938
- 0
kernel/x86_64/dgemv_t_bulldozer.S
File diff suppressed because it is too large
View File


+ 360
- 0
kernel/x86_64/gemm_ncopy_2_bulldozer.S View File

@@ -0,0 +1,360 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"


#ifndef WINDOWS_ABI

#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

#define I %r9

#else

#define STACKSIZE 256

#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)

#define B %r14
#define I %r15

#endif

#define J %r10
#define AO1 %r11
#define AO2 %r12
#define AO3 %r13
#define AO4 %rax

PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
#endif
pushq %r13
pushq %r12

#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp

vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)

movq OLD_B, B
#endif

leaq (,LDA, SIZE), LDA # Scaling

movq N, J
sarq $1, J
jle .L20
ALIGN_4

.L01:
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A

movq M, I
sarq $3, I
jle .L08
ALIGN_4

.L03:

#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7

vmovss 4 * SIZE(AO1), %xmm8
vmovss 4 * SIZE(AO2), %xmm9
vmovss 5 * SIZE(AO1), %xmm10
vmovss 5 * SIZE(AO2), %xmm11
vmovss 6 * SIZE(AO1), %xmm12
vmovss 6 * SIZE(AO2), %xmm13
vmovss 7 * SIZE(AO1), %xmm14
vmovss 7 * SIZE(AO2), %xmm15

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)

vmovss %xmm8, 8 * SIZE(B)
vmovss %xmm9, 9 * SIZE(B)
vmovss %xmm10, 10 * SIZE(B)
vmovss %xmm11, 11 * SIZE(B)
vmovss %xmm12, 12 * SIZE(B)
vmovss %xmm13, 13 * SIZE(B)
vmovss %xmm14, 14 * SIZE(B)
vmovss %xmm15, 15 * SIZE(B)

#else
prefetchw 256(B)

prefetchnta 256(AO1)
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 5 * SIZE(AO1), %xmm5
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 7 * SIZE(AO1), %xmm7

prefetchnta 256(AO2)
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7


prefetchw 256+64(B)
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
vmovups %xmm4, 8 * SIZE(B)
vmovups %xmm5, 10 * SIZE(B)
vmovups %xmm6, 12 * SIZE(B)
vmovups %xmm7, 14 * SIZE(B)

#endif

addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
subq $-16 * SIZE, B
decq I
jg .L03
ALIGN_4


.L08:
testq $4 , M
je .L14

ALIGN_4


.L13:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
#else

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3

vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3


vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
#endif

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B
ALIGN_4

.L14:
movq M, I
andq $3, I
jle .L16
ALIGN_4

.L15:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1

vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0

vmovups %xmm0, 0 * SIZE(B)
#endif

addq $SIZE, AO1
addq $SIZE, AO2
addq $2 * SIZE, B
decq I
jg .L15
ALIGN_4

.L16:
decq J
jg .L01
ALIGN_4

.L20:
testq $1, N
jle .L999

movq A, AO1

movq M, I
sarq $2, I
jle .L34
ALIGN_4

.L33:
#ifndef DOUBLE
vmovups 0 * SIZE(AO1), %xmm0

vmovups %xmm0, 0 * SIZE(B)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm1

vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
#endif

addq $4 * SIZE, AO1
subq $-4 * SIZE, B
decq I
jg .L33
ALIGN_4

.L34:
movq M, I
andq $3, I
jle .L999
ALIGN_4

.L35:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(B)
#endif

addq $SIZE, AO1
addq $1 * SIZE, B
decq I
jg .L35
ALIGN_4


.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15

addq $STACKSIZE, %rsp
#endif

popq %r12
popq %r13

#ifdef WINDOWS_ABI
popq %r14
popq %r15
#endif
ret

EPILOGUE

+ 374
- 0
kernel/x86_64/gemm_tcopy_2_bulldozer.S View File

@@ -0,0 +1,374 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI

#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */

#define I %r10
#define J %rbp

#define AO1 %r9
#define AO2 %r15
#define AO3 %r11
#define AO4 %r14
#define BO1 %r13
#define M8 %rbx
#define BO %rax

#else

#define STACKSIZE 256

#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
#define OLD_B 40 + 64 + STACKSIZE(%rsp)

#define B %rdi

#define I %r10
#define J %r11

#define AO1 %r12
#define AO2 %r13
#define AO3 %r14
#define AO4 %r15

#define BO1 %rsi
#define M8 %rbp
#define BO %rax

#endif

PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx

#ifdef WINDOWS_ABI
subq $STACKSIZE, %rsp

vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)

movq OLD_B, B
#endif

movq N, %rax
andq $-2, %rax
imulq M, %rax

leaq (B, %rax, SIZE), BO1

leaq (, LDA, SIZE), LDA
leaq (, M, SIZE), M8

movq M, J
sarq $1, J
jle .L20
ALIGN_4

.L01:
movq A, AO1
leaq (A, LDA ), AO2
leaq (A, LDA, 2), A

movq B, BO
addq $4 * SIZE, B

movq N, I
sarq $3, I
jle .L10
ALIGN_4


.L08:
#ifndef DOUBLE

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd 4 * SIZE(AO2), %xmm5
vmovsd 6 * SIZE(AO2), %xmm7

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm4, 0 * SIZE(BO)
vmovsd %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm6, 0 * SIZE(BO)
vmovsd %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO


#else

prefetchnta 256(AO1)
prefetchnta 256(AO2)
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 4 * SIZE(AO1), %xmm4
vmovups 6 * SIZE(AO1), %xmm6
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups 4 * SIZE(AO2), %xmm5
vmovups 6 * SIZE(AO2), %xmm7

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm4, 0 * SIZE(BO)
vmovups %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm6, 0 * SIZE(BO)
vmovups %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

#endif
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
decq I
jg .L08
ALIGN_4



.L10:
testq $4, N
jle .L12
#ifndef DOUBLE

vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO


#else

vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO

#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
ALIGN_4


.L12:
testq $2, N
jle .L14
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1

vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 0 * SIZE(AO2), %xmm1

vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
#endif
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
ALIGN_4

.L14:
testq $1, N
jle .L19

#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1

vmovss %xmm0, 0 * SIZE(BO1)
vmovss %xmm1, 1 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0

vmovups %xmm0, 0 * SIZE(BO1)
#endif

addq $2 * SIZE, BO1
ALIGN_4

.L19:
decq J
jg .L01
ALIGN_4

.L20:
testq $1, M
jle .L999
ALIGN_4

.L31:
movq A, AO1
movq B, BO

movq N, I
sarq $1, I
jle .L33
ALIGN_4

.L32:
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(BO)
#endif

addq $2 * SIZE, AO1
leaq (BO, M8, 2), BO
decq I
jg .L32
ALIGN_4

.L33:
testq $1, N
jle .L999

#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO1)
#endif
addq $1 * SIZE, BO1
ALIGN_4

.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15

addq $STACKSIZE, %rsp
#endif

popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif

ret

EPILOGUE

+ 4657
- 0
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 48
- 8
kernel/x86_64/sgemv_n.S View File

@@ -47,7 +47,7 @@

#ifndef WINDOWS_ABI

#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
@@ -58,10 +58,14 @@
#define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)

#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 96(%rsp)
#else

#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,6 +78,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)

#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)

#endif

#define LDA %r8
@@ -137,17 +147,41 @@
movq OLD_LDA, LDA
#endif

movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

#ifndef WINDOWS_ABI
movss %xmm0, ALPHA
#else
movss %xmm3, ALPHA
#endif


movq M,MMM
movq A,AA
movq N,NN
movq LDA,LDAX
movq X,XX
movq STACK_Y, Y
.L0t:
xorq I,I
addq $1,I
salq $22,I
subq I,MMM
movq I,M
jge .L00t

movq MMM,M
addq I,M
jle .L999x

.L00t:
movq AA,A
movq NN,N
movq LDAX,LDA
movq XX,X

movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER

leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA
@@ -5990,6 +6024,12 @@
ALIGN_3

.L999:
leaq (,M,SIZE),%rax
addq %rax,AA
jmp .L0t
ALIGN_4

.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12


+ 5
- 5
kernel/x86_64/sgemv_t.S View File

@@ -63,7 +63,7 @@

#else

#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,10 +74,10 @@
#define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define MMM 216(%rsp)
#define NN 224(%rsp)
#define AA 232(%rsp)
#define LDAX 240(%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)

#endif



+ 1
- 1
kernel/x86_64/symv_L_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_L_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse.S View File

@@ -76,7 +76,7 @@
#define movsd movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)


+ 1
- 1
kernel/x86_64/symv_U_sse2.S View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)


+ 1407
- 0
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 3
- 3
kernel/x86_64/zgemm_kernel_4x4_sandy.S View File

@@ -1385,7 +1385,7 @@ ALIGN_5
EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
EXTRA_DY $1, yvec13, xvec5;
EXTRA_DY $2, yvec12, xvec4;
EXTRA_DY $1, yvec12, xvec4;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 1*SIZE(C0), xvec0, xvec0;
@@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1);
STH_DX xvec7, 3*SIZE(C0, ldc, 1);
STL_DX xvec13, 0*SIZE(C0, ldc, 1);
STH_DX xvec13, 1*SIZE(C0, ldc, 1);
STL_DX xvec6, 2*SIZE(C0);
STH_DX xvec6, 3*SIZE(C0);
STL_DX xvec5, 2*SIZE(C0);
STH_DX xvec5, 3*SIZE(C0);
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C1), xvec0, xvec0;
LDH_DX 1*SIZE(C1), xvec0, xvec0;


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save