Conflicts: kernel/x86_64/KERNEL.BULLDOZERtags/v0.2.9.rc1
| @@ -4,12 +4,16 @@ | |||||
| *.dylib | *.dylib | ||||
| *.def | *.def | ||||
| *.o | *.o | ||||
| *.out | |||||
| lapack-3.1.1 | lapack-3.1.1 | ||||
| lapack-3.1.1.tgz | lapack-3.1.1.tgz | ||||
| lapack-3.4.1 | lapack-3.4.1 | ||||
| lapack-3.4.1.tgz | lapack-3.4.1.tgz | ||||
| lapack-3.4.2 | lapack-3.4.2 | ||||
| lapack-3.4.2.tgz | lapack-3.4.2.tgz | ||||
| lapack-netlib/make.inc | |||||
| lapack-netlib/lapacke/include/lapacke_mangling.h | |||||
| lapack-netlib/TESTING/testing_results.txt | |||||
| *.so | *.so | ||||
| *.a | *.a | ||||
| .svn | .svn | ||||
| @@ -0,0 +1,24 @@ | |||||
| language: c | |||||
| compiler: | |||||
| - gcc | |||||
| env: | |||||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64" | |||||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" | |||||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" | |||||
| - TARGET_BOX=LINUX32 BTYPE="BINARY=32" | |||||
| - TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||||
| before_install: | |||||
| - sudo apt-get update -qq | |||||
| - sudo apt-get install -qq gfortran | |||||
| - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi | |||||
| - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi | |||||
| script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||||
| # whitelist | |||||
| branches: | |||||
| only: | |||||
| - master | |||||
| - develop | |||||
| @@ -0,0 +1,87 @@ | |||||
| # Contributions to the OpenBLAS project | |||||
| ## Creator & Maintainer | |||||
| * Zhang Xianyi <traits.zhang@gmail.com> | |||||
| ## Active Developers | |||||
| * Wang Qian <traz0824@gmail.com> | |||||
| * Optimize BLAS3 on ICT Loongson 3A. | |||||
| * Optimize BLAS3 on Intel Sandy Bridge. | |||||
| * Zaheer Chothia <zaheer.chothia@gmail.com> | |||||
| * Improve compatibility with complex numbers | |||||
| * Build LAPACKE: C interface to LAPACK | |||||
| * Improve the windows build. | |||||
| ## Previous Developers | |||||
| * Chen Shaohu <huhumartinwar@gmail.com> | |||||
| * Optimize GEMV on the Loongson 3A processor. | |||||
| * Luo Wen | |||||
| * Intern. Test Level-2 BLAS. | |||||
| ## Contributors | |||||
| In chronological order: | |||||
| * pipping <http://page.mi.fu-berlin.de/pipping> | |||||
| * [2011-06-11] Make USE_OPENMP=0 disable openmp. | |||||
| * Stefan Karpinski <stefan@karpinski.org> | |||||
| * [2011-12-28] Fix a bug about SystemStubs on Mac OS X. | |||||
| * Alexander Eberspächer <https://github.com/aeberspaecher> | |||||
| * [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32. | |||||
| * Mike Nolta <mike@nolta.net> | |||||
| * [2012-05-19] Fix building bug on FreeBSD and NetBSD. | |||||
| * Sylvestre Ledru <https://github.com/sylvestre> | |||||
| * [2012-07-01] Improve the detection of sparc. Fix building bug under | |||||
| Hurd and kfreebsd. | |||||
| * Jameson Nash <https://github.com/vtjnash> | |||||
| * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to | |||||
| make on the command line. | |||||
| * Alexander Nasonov <alnsn@yandex.ru> | |||||
| * [2012-11-10] Fix NetBSD build. | |||||
| * Sébastien Villemot <sebastien@debian.org> | |||||
| * [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package. | |||||
| * Werner Saar <wernsaar@googlemail.com> | |||||
| * [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer | |||||
| * [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer | |||||
| * [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer | |||||
| * [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer | |||||
| * [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer | |||||
| * [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer | |||||
| * [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer | |||||
| * [2013-06-21] Optimize dcopy kernel on AMD Bulldozer | |||||
| * Kang-Che Sung <Explorer09@gmail.com> | |||||
| * [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c. | |||||
| * Kenneth Hoste <kenneth.hoste@gmail.com> | |||||
| * [2013-05-22] Adjust Makefile about downloading LAPACK source files. | |||||
| * Lei WANG <https://github.com/wlbksy> | |||||
| * [2013-05-22] Fix a bug about wget. | |||||
| * Dan Luu <http://www.linkedin.com/in/danluu> | |||||
| * [2013-06-30] Add Intel Haswell support (using sandybridge optimizations). | |||||
| * grisuthedragon <https://github.com/grisuthedragon> | |||||
| * [2013-07-11] Create openblas_get_parallel to query which parallelization | |||||
| model is used by OpenBLAS. | |||||
| * Sébastien Fabbro <bicatali@gentoo.org> | |||||
| * [2013-07-24] Modify makefile to respect user's LDFLAGS | |||||
| * [2013-07-24] Add stack markings for GNU as arch-independent for assembler files | |||||
| * [Your name or handle] <[email or website]> | |||||
| * [Date] [Brief summary of your changes] | |||||
| @@ -1,4 +1,54 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.7 | |||||
| 20-Jul-2013 | |||||
| common: | |||||
| * Support LSB (Linux Standard Base) 4.1. | |||||
| e.g. make CC=lsbcc | |||||
| * Include the LAPACK 3.4.2 source code in the repo. | |||||
| Avoid downloading at compile time. | |||||
| * Add NO_PARALLEL_MAKE flag to disable parallel make. | |||||
| * Create openblas_get_parallel to query which | |||||
| parallelization model is used by OpenBLAS. (Thank grisuthedragon) | |||||
| * Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X. | |||||
| * Change LIBSUFFIX from .lib to .a on windows. | |||||
| * A workaround for the dtrti_U single-thread bug. Replace it with LAPACK codes. (#191) | |||||
| x86/x86-64: | |||||
| * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on | |||||
| AMD Bulldozer. (Thank Werner Saar) | |||||
| * Add Intel Haswell support (using Sandybridge optimizations). | |||||
| (Thank Dan Luu) | |||||
| * Add AMD Piledriver support (using Bulldozer optimizations). | |||||
| * Fix the computational error in zgemm avx kernel on | |||||
| Sandybridge. (#237) | |||||
| * Fix the overflow bug in gemv. | |||||
| * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS | |||||
| is very large.(#214, #221, #246). | |||||
| MIPS64: | |||||
| * Support loongcc (Open64 based) compiler for ICT Loongson 3A/B. | |||||
| Power: | |||||
| * Support Power7 using the old Power6 kernels. (#220) | |||||
| ==================================================================== | |||||
| Version 0.2.6 | |||||
| 2-Mar-2013 | |||||
| common: | |||||
| * Improved OpenMP performance slightly. (d744c9) | |||||
| * Improved cblas.h compatibility with Intel MKL.(#185) | |||||
| * Fixed the overflowing bug in single thread cholesky factorization. | |||||
| * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) | |||||
| x86/x86-64: | |||||
| * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | |||||
| We will tune the performance in future. | |||||
| * Auto-detect Intel Xeon E7540. | |||||
| * Fixed the overflowing buffer bug of gemv. (#173) | |||||
| * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) | |||||
| MIPS64: | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.2.5 | Version 0.2.5 | ||||
| 26-Nov-2012 | 26-Nov-2012 | ||||
| @@ -82,27 +82,27 @@ endif | |||||
| shared : | shared : | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| $(MAKE) -C exports so | |||||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||||
| @$(MAKE) -C exports so | |||||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | ifeq ($(OSNAME), FreeBSD) | ||||
| $(MAKE) -C exports so | |||||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| @$(MAKE) -C exports so | |||||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), NetBSD) | ifeq ($(OSNAME), NetBSD) | ||||
| $(MAKE) -C exports so | |||||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| @$(MAKE) -C exports so | |||||
| @-ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| $(MAKE) -C exports dyn | |||||
| -ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||||
| @$(MAKE) -C exports dyn | |||||
| @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| $(MAKE) -C exports dll | |||||
| @$(MAKE) -C exports dll | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| $(MAKE) -C exports dll | |||||
| @$(MAKE) -C exports dll | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -131,30 +131,33 @@ endif | |||||
| ifeq ($(NOFORTRAN), 1) | ifeq ($(NOFORTRAN), 1) | ||||
| $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) | $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) | ||||
| endif | endif | ||||
| -ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||||
| for d in $(SUBDIRS) ; \ | |||||
| @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||||
| @for d in $(SUBDIRS) ; \ | |||||
| do if test -d $$d; then \ | do if test -d $$d; then \ | ||||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | $(MAKE) -C $$d $(@F) || exit 1 ; \ | ||||
| fi; \ | fi; \ | ||||
| done | done | ||||
| #Save the config files for installation | #Save the config files for installation | ||||
| cp Makefile.conf Makefile.conf_last | |||||
| cp config.h config_last.h | |||||
| @cp Makefile.conf Makefile.conf_last | |||||
| @cp config.h config_last.h | |||||
| ifdef QUAD_PRECISION | ifdef QUAD_PRECISION | ||||
| echo "#define QUAD_PRECISION">> config_last.h | |||||
| @echo "#define QUAD_PRECISION">> config_last.h | |||||
| endif | endif | ||||
| ifeq ($(EXPRECISION), 1) | ifeq ($(EXPRECISION), 1) | ||||
| echo "#define EXPRECISION">> config_last.h | |||||
| @echo "#define EXPRECISION">> config_last.h | |||||
| endif | endif | ||||
| ## | ## | ||||
| ifeq ($(DYNAMIC_ARCH), 1) | ifeq ($(DYNAMIC_ARCH), 1) | ||||
| $(MAKE) -C kernel commonlibs || exit 1 | |||||
| for d in $(DYNAMIC_CORE) ; \ | |||||
| @$(MAKE) -C kernel commonlibs || exit 1 | |||||
| @for d in $(DYNAMIC_CORE) ; \ | |||||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | ||||
| done | done | ||||
| echo DYNAMIC_ARCH=1 >> Makefile.conf_last | |||||
| @echo DYNAMIC_ARCH=1 >> Makefile.conf_last | |||||
| endif | endif | ||||
| touch lib.grd | |||||
| ifdef USE_THREAD | |||||
| @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last | |||||
| endif | |||||
| @touch lib.grd | |||||
| prof : prof_blas prof_lapack | prof : prof_blas prof_lapack | ||||
| @@ -203,19 +206,19 @@ ifeq ($(NO_LAPACK), 1) | |||||
| netlib : | netlib : | ||||
| else | else | ||||
| netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||||
| netlib : lapack_prebuild | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||||
| endif | endif | ||||
| ifndef NO_LAPACKE | ifndef NO_LAPACKE | ||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||||
| endif | endif | ||||
| endif | endif | ||||
| prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||||
| prof_lapack : lapack_prebuild | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||||
| $(NETLIB_LAPACK_DIR)/make.inc : | |||||
| lapack_prebuild : | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @@ -224,11 +227,7 @@ ifndef NOFORTRAN | |||||
| -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| ifdef INTERFACE64 | |||||
| -@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| else | |||||
| -@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| endif | |||||
| -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @@ -244,7 +243,7 @@ endif | |||||
| lapack-3.4.2 : lapack-3.4.2.tgz | lapack-3.4.2 : lapack-3.4.2.tgz | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| ifndef NO_LAPACK | ifndef NO_LAPACK | ||||
| @if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \ | |||||
| @if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \ | |||||
| echo $(TAR) zxf $< ;\ | echo $(TAR) zxf $< ;\ | ||||
| $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ | $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ | ||||
| rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ | rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ | ||||
| @@ -262,27 +261,31 @@ lapack-3.4.2.tgz : | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) | ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) | ||||
| curl -O $(LAPACK_URL) | |||||
| curl -O $(LAPACK_URL); | |||||
| else | else | ||||
| ifeq ($(OSNAME), FreeBSD) | ifeq ($(OSNAME), FreeBSD) | ||||
| fetch $(LAPACK_URL) | |||||
| fetch $(LAPACK_URL); | |||||
| else | else | ||||
| wget $(LAPACK_URL) | |||||
| wget -O $@ $(LAPACK_URL); | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| large.tgz : | large.tgz : | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -wget http://www.netlib.org/lapack/timing/large.tgz | |||||
| if [ ! -a $< ]; then | |||||
| -wget http://www.netlib.org/lapack/timing/large.tgz; | |||||
| fi | |||||
| endif | endif | ||||
| timing.tgz : | timing.tgz : | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -wget http://www.netlib.org/lapack/timing/timing.tgz | |||||
| if [ ! -a $< ]; then | |||||
| -wget http://www.netlib.org/lapack/timing/timing.tgz; | |||||
| fi | |||||
| endif | endif | ||||
| lapack-timing : lapack-3.4.2 large.tgz timing.tgz | |||||
| lapack-timing : large.tgz timing.tgz | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | ||||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | ||||
| @@ -314,10 +317,12 @@ clean :: | |||||
| #endif | #endif | ||||
| @$(MAKE) -C reference clean | @$(MAKE) -C reference clean | ||||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | ||||
| ifeq ($(OSNAME), Darwin) | |||||
| @rm -rf getarch.dSYM getarch_2nd.dSYM | |||||
| endif | |||||
| @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | ||||
| @if test -d $(NETLIB_LAPACK_DIR); then \ | |||||
| echo deleting $(NETLIB_LAPACK_DIR); \ | |||||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||||
| fi | |||||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||||
| @rm -f *.grd Makefile.conf_last config_last.h | @rm -f *.grd Makefile.conf_last config_last.h | ||||
| @echo Done. | |||||
| @echo Done. | |||||
| @@ -1,6 +1 @@ | |||||
| COPT = -Wall -O2 # -DGEMMTEST | COPT = -Wall -O2 # -DGEMMTEST | ||||
| ifdef BINARY64 | |||||
| else | |||||
| # LDFLAGS = -m elf32ppc | |||||
| LDFLAGS = -m elf_i386 | |||||
| endif | |||||
| @@ -5,6 +5,7 @@ include ./Makefile.system | |||||
| OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include | OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include | ||||
| OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib | OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib | ||||
| OPENBLAS_BUILD_DIR:=$(CURDIR) | |||||
| .PHONY : install | .PHONY : install | ||||
| .NOTPARALLEL : install | .NOTPARALLEL : install | ||||
| @@ -48,32 +49,36 @@ endif | |||||
| #for install static library | #for install static library | ||||
| @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) | @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) | ||||
| @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) | @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) | ||||
| @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX) | |||||
| @cd $(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||||
| #for install shared library | #for install shared library | ||||
| @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) | @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) | ||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so | |||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION) | |||||
| @cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | ifeq ($(OSNAME), FreeBSD) | ||||
| -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so | |||||
| @cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), NetBSD) | ifeq ($(OSNAME), NetBSD) | ||||
| -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so | |||||
| @cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @cd $(OPENBLAS_LIBRARY_DIR) ; \ | |||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| -cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| -install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) | |||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib | |||||
| @-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) | |||||
| @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| @-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||||
| endif | endif | ||||
| @echo Install OK! | @echo Install OK! | ||||
| @@ -17,13 +17,7 @@ endif | |||||
| endif | endif | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifeq ($(OSNAME), Linux) | |||||
| LDFLAGS = -m elf64ppc | |||||
| endif | |||||
| ifeq ($(OSNAME), Darwin) | |||||
| LDFLAGS = -arch ppc64 | |||||
| endif | |||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| CCOMMON_OPT += -mpowerpc64 -maix64 | CCOMMON_OPT += -mpowerpc64 -maix64 | ||||
| @@ -34,17 +28,12 @@ ifeq ($(COMPILER_F77), xlf) | |||||
| FCOMMON_OPT += -q64 | FCOMMON_OPT += -q64 | ||||
| endif | endif | ||||
| ARFLAGS = -X 64 | ARFLAGS = -X 64 | ||||
| LDFLAGS = -b64 | |||||
| ASFLAGS = -a64 | ASFLAGS = -a64 | ||||
| endif | endif | ||||
| else | else | ||||
| ifeq ($(OSNAME), Linux) | |||||
| LDFLAGS = -m elf32ppc | |||||
| endif | |||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| CCOMMON_OPT += -Wa,-a32 | CCOMMON_OPT += -Wa,-a32 | ||||
| ARFLAGS = -X 32 | ARFLAGS = -X 32 | ||||
| LDFLAGS = -b32 | |||||
| ASFLAGS = -a32 | ASFLAGS = -a32 | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.2.5 | |||||
| VERSION = 0.2.7 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -81,6 +81,9 @@ VERSION = 0.2.5 | |||||
| # and OS. However, the performance is low. | # and OS. However, the performance is low. | ||||
| # NO_AVX = 1 | # NO_AVX = 1 | ||||
| # Don't use parallel make. | |||||
| # NO_PARALLEL_MAKE = 1 | |||||
| # If you would like a detailed performance report of GotoBLAS. | # If you would like a detailed performance report of GotoBLAS. | ||||
| # FUNCTION_PROFILE = 1 | # FUNCTION_PROFILE = 1 | ||||
| @@ -104,8 +107,8 @@ VERSION = 0.2.5 | |||||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | ||||
| # with a single thread. You can use this flag to avoid the overhead of multi-threading | # with a single thread. You can use this flag to avoid the overhead of multi-threading | ||||
| # in small matrix sizes. The default value is 50. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 50 | |||||
| # in small matrix sizes. The default value is 4. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||||
| # If you need a sanity check by comparing against reference BLAS. It'll be very | # If you need a sanity check by comparing against reference BLAS. It'll be very | ||||
| # slow (Not implemented yet). | # slow (Not implemented yet). | ||||
| @@ -10,7 +10,6 @@ endif | |||||
| ifeq ($(COMPILER_F77), f90) | ifeq ($(COMPILER_F77), f90) | ||||
| FCOMMON_OPT += -xarch=v9 | FCOMMON_OPT += -xarch=v9 | ||||
| endif | endif | ||||
| LDFLAGS = -64 | |||||
| else | else | ||||
| CCOMMON_OPT += -mcpu=v9 | CCOMMON_OPT += -mcpu=v9 | ||||
| @@ -9,9 +9,7 @@ ifndef TOPDIR | |||||
| TOPDIR = . | TOPDIR = . | ||||
| endif | endif | ||||
| ifndef NETLIB_LAPACK_DIR | |||||
| NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2 | |||||
| endif | |||||
| NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | |||||
| # Default C compiler | # Default C compiler | ||||
| # - Only set if not specified on the command line or inherited from the environment. | # - Only set if not specified on the command line or inherited from the environment. | ||||
| @@ -20,6 +18,12 @@ endif | |||||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | ||||
| ifeq ($(origin CC),default) | ifeq ($(origin CC),default) | ||||
| CC = gcc | CC = gcc | ||||
| # Change the default compiler to clang on Mac OS X. | |||||
| # http://stackoverflow.com/questions/714100/os-detecting-makefile | |||||
| UNAME_S := $(shell uname -s) | |||||
| ifeq ($(UNAME_S),Darwin) | |||||
| CC = clang | |||||
| endif | |||||
| endif | endif | ||||
| # Default Fortran compiler (FC) is selected by f_check. | # Default Fortran compiler (FC) is selected by f_check. | ||||
| @@ -53,7 +57,7 @@ GETARCH_FLAGS += -DUSE64BITINT | |||||
| endif | endif | ||||
| ifndef GEMM_MULTITHREAD_THRESHOLD | ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| GEMM_MULTITHREAD_THRESHOLD=50 | |||||
| GEMM_MULTITHREAD_THRESHOLD=4 | |||||
| endif | endif | ||||
| GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | ||||
| @@ -65,6 +69,19 @@ ifeq ($(DEBUG), 1) | |||||
| GETARCH_FLAGS += -g | GETARCH_FLAGS += -g | ||||
| endif | endif | ||||
| ifeq ($(QUIET_MAKE), 1) | |||||
| MAKE += -s | |||||
| endif | |||||
| ifndef NO_PARALLEL_MAKE | |||||
| NO_PARALLEL_MAKE=0 | |||||
| endif | |||||
| GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) | |||||
| ifeq ($(HOSTCC), loongcc) | |||||
| GETARCH_FLAGS += -static | |||||
| endif | |||||
| # This operation is expensive, so execution should be once. | # This operation is expensive, so execution should be once. | ||||
| ifndef GOTOBLAS_MAKEFILE | ifndef GOTOBLAS_MAKEFILE | ||||
| export GOTOBLAS_MAKEFILE = 1 | export GOTOBLAS_MAKEFILE = 1 | ||||
| @@ -148,7 +165,12 @@ EXTRALIB += -defaultlib:advapi32 | |||||
| SUFFIX = obj | SUFFIX = obj | ||||
| PSUFFIX = pobj | PSUFFIX = pobj | ||||
| LIBSUFFIX = lib | |||||
| LIBSUFFIX = a | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| CCOMMON_OPT += -DMS_ABI | |||||
| endif | |||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| #Test for supporting MS_ABI | #Test for supporting MS_ABI | ||||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | ||||
| @@ -167,8 +189,15 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||||
| CCOMMON_OPT += -DMS_ABI | CCOMMON_OPT += -DMS_ABI | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| # Ensure the correct stack alignment on Win32 | |||||
| # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 | |||||
| ifeq ($(ARCH), x86) | |||||
| CCOMMON_OPT += -mincoming-stack-boundary=2 | |||||
| FCOMMON_OPT += -mincoming-stack-boundary=2 | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| @@ -223,11 +252,17 @@ NO_BINARY_MODE = 1 | |||||
| endif | endif | ||||
| ifndef NO_EXPRECISION | ifndef NO_EXPRECISION | ||||
| ifeq ($(F_COMPILER), GFORTRAN) | ifeq ($(F_COMPILER), GFORTRAN) | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| # ifeq logical or. GCC or LSB | |||||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) | |||||
| EXPRECISION = 1 | EXPRECISION = 1 | ||||
| CCOMMON_OPT += -DEXPRECISION -m128bit-long-double | CCOMMON_OPT += -DEXPRECISION -m128bit-long-double | ||||
| FCOMMON_OPT += -m128bit-long-double | FCOMMON_OPT += -m128bit-long-double | ||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| EXPRECISION = 1 | |||||
| CCOMMON_OPT += -DEXPRECISION | |||||
| FCOMMON_OPT += -m128bit-long-double | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -235,11 +270,17 @@ endif | |||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| ifndef NO_EXPRECISION | ifndef NO_EXPRECISION | ||||
| ifeq ($(F_COMPILER), GFORTRAN) | ifeq ($(F_COMPILER), GFORTRAN) | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| # ifeq logical or. GCC or LSB | |||||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) | |||||
| EXPRECISION = 1 | EXPRECISION = 1 | ||||
| CCOMMON_OPT += -DEXPRECISION -m128bit-long-double | CCOMMON_OPT += -DEXPRECISION -m128bit-long-double | ||||
| FCOMMON_OPT += -m128bit-long-double | FCOMMON_OPT += -m128bit-long-double | ||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| EXPRECISION = 1 | |||||
| CCOMMON_OPT += -DEXPRECISION | |||||
| FCOMMON_OPT += -m128bit-long-double | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -249,7 +290,13 @@ CCOMMON_OPT += -wd981 | |||||
| endif | endif | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| # ifeq logical or. GCC or LSB | |||||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) | |||||
| CCOMMON_OPT += -fopenmp | |||||
| endif | |||||
| ifeq ($(C_COMPILER), CLANG) | |||||
| $(error OpenBLAS: Clang didn't support OpenMP yet.) | |||||
| CCOMMON_OPT += -fopenmp | CCOMMON_OPT += -fopenmp | ||||
| endif | endif | ||||
| @@ -277,14 +324,14 @@ ifeq ($(ARCH), x86) | |||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -318,11 +365,18 @@ endif | |||||
| # C Compiler dependent settings | # C Compiler dependent settings | ||||
| # | # | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| # ifeq logical or. GCC or CLANG or LSB | |||||
| # http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB)) | |||||
| CCOMMON_OPT += -Wall | CCOMMON_OPT += -Wall | ||||
| COMMON_PROF += -fno-inline | COMMON_PROF += -fno-inline | ||||
| NO_UNINITIALIZED_WARN = -Wno-uninitialized | NO_UNINITIALIZED_WARN = -Wno-uninitialized | ||||
| ifeq ($(QUIET_MAKE), 1) | |||||
| CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused | |||||
| endif | |||||
| ifdef NO_BINARY_MODE | ifdef NO_BINARY_MODE | ||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| @@ -407,7 +461,12 @@ endif | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | ifeq ($(F_COMPILER), GFORTRAN) | ||||
| CCOMMON_OPT += -DF_INTERFACE_GFORT | CCOMMON_OPT += -DF_INTERFACE_GFORT | ||||
| FCOMMON_OPT += -Wall | FCOMMON_OPT += -Wall | ||||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| ifneq ($(C_COMPILER), LSB) | |||||
| EXTRALIB += -lgfortran | EXTRALIB += -lgfortran | ||||
| endif | |||||
| endif | |||||
| ifdef NO_BINARY_MODE | ifdef NO_BINARY_MODE | ||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| @@ -514,11 +573,28 @@ ifdef INTERFACE64 | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifndef BINARY64 | |||||
| FCOMMON_OPT += -n32 | |||||
| else | |||||
| FCOMMON_OPT += -n64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| FCOMMON_OPT += -loongson3 -static | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| FCOMMON_OPT += -loongson3 -static | |||||
| endif | |||||
| else | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| FCOMMON_OPT += -m32 | FCOMMON_OPT += -m32 | ||||
| else | else | ||||
| FCOMMON_OPT += -m64 | FCOMMON_OPT += -m64 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef USE_OPENMP | ifdef USE_OPENMP | ||||
| FEXTRALIB += -lstdc++ | FEXTRALIB += -lstdc++ | ||||
| @@ -527,12 +603,30 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), OPEN64) | ifeq ($(C_COMPILER), OPEN64) | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifndef BINARY64 | |||||
| CCOMMON_OPT += -n32 | |||||
| else | |||||
| CCOMMON_OPT += -n64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| CCOMMON_OPT += -loongson3 -static | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| CCOMMON_OPT += -loongson3 -static | |||||
| endif | |||||
| else | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| CCOMMON_OPT += -m32 | CCOMMON_OPT += -m32 | ||||
| else | else | ||||
| CCOMMON_OPT += -m64 | CCOMMON_OPT += -m64 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(C_COMPILER), SUN) | ifeq ($(C_COMPILER), SUN) | ||||
| CCOMMON_OPT += -w | CCOMMON_OPT += -w | ||||
| @@ -741,6 +835,15 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||||
| override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | ||||
| #MAKEOVERRIDES = | #MAKEOVERRIDES = | ||||
| LAPACK_CFLAGS = $(CFLAGS) | |||||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||||
| ifdef INTERFACE64 | |||||
| LAPACK_CFLAGS += -DLAPACK_ILP64 | |||||
| endif | |||||
| ifeq ($(C_COMPILER), LSB) | |||||
| LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE | |||||
| endif | |||||
| ifndef SUFFIX | ifndef SUFFIX | ||||
| SUFFIX = o | SUFFIX = o | ||||
| endif | endif | ||||
| @@ -835,6 +938,13 @@ export ZGEMM_UNROLL_M | |||||
| export ZGEMM_UNROLL_N | export ZGEMM_UNROLL_N | ||||
| export XGEMM_UNROLL_M | export XGEMM_UNROLL_M | ||||
| export XGEMM_UNROLL_N | export XGEMM_UNROLL_N | ||||
| export CGEMM3M_UNROLL_M | |||||
| export CGEMM3M_UNROLL_N | |||||
| export ZGEMM3M_UNROLL_M | |||||
| export ZGEMM3M_UNROLL_N | |||||
| export XGEMM3M_UNROLL_M | |||||
| export XGEMM3M_UNROLL_N | |||||
| ifdef USE_CUDA | ifdef USE_CUDA | ||||
| export CUDADIR | export CUDADIR | ||||
| @@ -1,8 +1,5 @@ | |||||
| # COMPILER_PREFIX = mingw32- | # COMPILER_PREFIX = mingw32- | ||||
| ifeq ($(OSNAME), Linux) | |||||
| LDFLAGS = -melf_i386 | |||||
| endif | |||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| ARFLAGS = -m x86 | ARFLAGS = -m x86 | ||||
| @@ -2,25 +2,12 @@ | |||||
| ifeq ($(OSNAME), SunOS) | ifeq ($(OSNAME), SunOS) | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| LDFLAGS = -64 | |||||
| ifeq ($(F_COMPILER), SUN) | ifeq ($(F_COMPILER), SUN) | ||||
| FCOMMON_OPT += -m64 | FCOMMON_OPT += -m64 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| LDFLAGS = -m elf_x86_64_fbsd | |||||
| endif | |||||
| ifeq ($(OSNAME), Linux) | |||||
| LDFLAGS = -m elf_x86_64 | |||||
| endif | |||||
| ifeq ($(OSNAME), Darwin) | |||||
| LDFLAGS = | |||||
| endif | |||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| ARFLAGS = -m x64 | ARFLAGS = -m x64 | ||||
| endif | endif | ||||
| @@ -1,11 +1,20 @@ | |||||
| # OpenBLAS | # OpenBLAS | ||||
| [](https://travis-ci.org/xianyi/OpenBLAS) | |||||
| ## Introduction | ## Introduction | ||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>. | |||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. | |||||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | ||||
| ## Installation | |||||
| ## Binary Packages | |||||
| We provide binary packages for the following platform. | |||||
| * Windows x86/x86_64 | |||||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). | |||||
| ## Installation from Source | |||||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | Download from project homepage. http://xianyi.github.com/OpenBLAS/ | ||||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | Or, check out codes from git://github.com/xianyi/OpenBLAS.git | ||||
| @@ -23,11 +32,15 @@ On X86 box, compile this library for loongson3a CPU. | |||||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | ||||
| On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. | |||||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||||
| ### Debug version | ### Debug version | ||||
| make DEBUG=1 | make DEBUG=1 | ||||
| ### Intall to the directory (Optional) | |||||
| ### Install to the directory (optional) | |||||
| Example: | Example: | ||||
| @@ -43,8 +56,10 @@ Please read GotoBLAS_01Readme.txt | |||||
| #### x86/x86-64: | #### x86/x86-64: | ||||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | ||||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | ||||
| - **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge). | |||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. | |||||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | |||||
| - **AMD PILEDRIVER**: Used Bulldozer codes. | |||||
| #### MIPS64: | #### MIPS64: | ||||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | ||||
| @@ -54,7 +69,7 @@ Please read GotoBLAS_01Readme.txt | |||||
| - **GNU/Linux** | - **GNU/Linux** | ||||
| - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | ||||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | ||||
| - **FreeBSD**: Supportted by community. We didn't test the library on this OS. | |||||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | |||||
| ## Usages | ## Usages | ||||
| Link with libopenblas.a or -lopenblas for shared library. | Link with libopenblas.a or -lopenblas for shared library. | ||||
| @@ -79,7 +94,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro | |||||
| ### Set the number of threads on runtime. | ### Set the number of threads on runtime. | ||||
| We provided the below functions to controll the number of threads on runtime. | |||||
| We provided the below functions to control the number of threads on runtime. | |||||
| void goto_set_num_threads(int num_threads); | void goto_set_num_threads(int num_threads); | ||||
| @@ -91,7 +106,8 @@ If you compile this lib with USE_OPENMP=1, you should use the above functions, t | |||||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | Please add a issue in https://github.com/xianyi/OpenBLAS/issues | ||||
| ## Contact | ## Contact | ||||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users | |||||
| * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev | |||||
| ## ChangeLog | ## ChangeLog | ||||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | ||||
| @@ -104,10 +120,9 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve | |||||
| * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. | * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. | ||||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. | * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. | ||||
| ## Specification of Git Branches | |||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
| Now, there are 4 branches in github.com. | |||||
| * The master branch. This a main branch to reflect a production-ready state. | |||||
| * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. | |||||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
| * The gh-pages branch. This is for web pages | |||||
| ## Contributing | |||||
| 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. | |||||
| 1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. | |||||
| 1. Write a test which shows that the bug was fixed or that the feature works as expected. | |||||
| 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. | |||||
| @@ -8,8 +8,8 @@ Supported List: | |||||
| 1.X86/X86_64 | 1.X86/X86_64 | ||||
| a)Intel CPU: | a)Intel CPU: | ||||
| P2 | P2 | ||||
| COPPERMINE | |||||
| KATMAI | KATMAI | ||||
| COPPERMINE | |||||
| NORTHWOOD | NORTHWOOD | ||||
| PRESCOTT | PRESCOTT | ||||
| BANIAS | BANIAS | ||||
| @@ -33,6 +33,8 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { | |||||
| } | } | ||||
| $compiler = ""; | $compiler = ""; | ||||
| $compiler = LSB if ($data =~ /COMPILER_LSB/); | |||||
| $compiler = CLANG if ($data =~ /COMPILER_CLANG/); | |||||
| $compiler = PGI if ($data =~ /COMPILER_PGI/); | $compiler = PGI if ($data =~ /COMPILER_PGI/); | ||||
| $compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); | $compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); | ||||
| $compiler = INTEL if ($data =~ /COMPILER_INTEL/); | $compiler = INTEL if ($data =~ /COMPILER_INTEL/); | ||||
| @@ -117,7 +119,11 @@ if ($compiler eq "OPEN64") { | |||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| if ($compiler eq "GCC") { | |||||
| if ($compiler eq "CLANG") { | |||||
| $openmp = "-fopenmp"; | |||||
| } | |||||
| if ($compiler eq "GCC" || $compiler eq "LSB") { | |||||
| $openmp = "-fopenmp"; | $openmp = "-fopenmp"; | ||||
| } | } | ||||
| @@ -241,13 +247,13 @@ print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||||
| if ($os eq "LINUX") { | if ($os eq "LINUX") { | ||||
| @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); | |||||
| # @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); | |||||
| if ($pthread[2] ne "") { | |||||
| print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; | |||||
| } else { | |||||
| # if ($pthread[2] ne "") { | |||||
| # print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; | |||||
| # } else { | |||||
| print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | ||||
| } | |||||
| # } | |||||
| } else { | } else { | ||||
| print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; | ||||
| } | } | ||||
| @@ -16,6 +16,16 @@ void goto_set_num_threads(int num_threads); | |||||
| /*Get the build configure on runtime.*/ | /*Get the build configure on runtime.*/ | ||||
| char* openblas_get_config(void); | char* openblas_get_config(void); | ||||
| /* Get the parallelization type which is used by OpenBLAS */ | |||||
| int openblas_get_parallel(void); | |||||
| /* OpenBLAS is compiled for sequential use */ | |||||
| #define OPENBLAS_SEQUENTIAL 0 | |||||
| /* OpenBLAS is compiled using normal threading model */ | |||||
| #define OPENBLAS_THREAD 1 | |||||
| /* OpenBLAS is compiled using OpenMP threading model */ | |||||
| #define OPENBLAS_OPENMP 2 | |||||
| #define CBLAS_INDEX size_t | #define CBLAS_INDEX size_t | ||||
| typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | ||||
| @@ -314,6 +314,23 @@ typedef int blasint; | |||||
| #define YIELDING sched_yield() | #define YIELDING sched_yield() | ||||
| #endif | #endif | ||||
| /*** | |||||
| To alloc job_t on heap or stack. | |||||
| Please see https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| ***/ | |||||
| #if defined(OS_WINDOWS) | |||||
| #define GETRF_MEM_ALLOC_THRESHOLD 32 | |||||
| #define BLAS3_MEM_ALLOC_THRESHOLD 32 | |||||
| #endif | |||||
| #ifndef GETRF_MEM_ALLOC_THRESHOLD | |||||
| #define GETRF_MEM_ALLOC_THRESHOLD 80 | |||||
| #endif | |||||
| #ifndef BLAS3_MEM_ALLOC_THRESHOLD | |||||
| #define BLAS3_MEM_ALLOC_THRESHOLD 160 | |||||
| #endif | |||||
| #ifdef QUAD_PRECISION | #ifdef QUAD_PRECISION | ||||
| #include "common_quad.h" | #include "common_quad.h" | ||||
| #endif | #endif | ||||
| @@ -150,9 +150,17 @@ REALNAME: | |||||
| #define PROFCODE .prologue 0 | #define PROFCODE .prologue 0 | ||||
| #endif | #endif | ||||
| #if defined(__linux__) && defined(__ELF__) | |||||
| #define GNUSTACK .section .note.GNU-stack,"",%progbits | |||||
| #else | |||||
| #define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | #define EPILOGUE \ | ||||
| .end REALNAME; \ | .end REALNAME; \ | ||||
| .ident VERSION | |||||
| .ident VERSION; \ | |||||
| GNUSTACK | |||||
| #endif | #endif | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| @@ -379,8 +379,15 @@ REALNAME: | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| #if defined(__linux__) && defined(__ELF__) | |||||
| #define GNUSTACK .section .note.GNU-stack,"",%progbits | |||||
| #else | |||||
| #define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | #define EPILOGUE \ | ||||
| .endp REALNAME | |||||
| .endp REALNAME ; \ | |||||
| GNUSTACK | |||||
| #define START_ADDRESS 0x20000fc800000000UL | #define START_ADDRESS 0x20000fc800000000UL | ||||
| @@ -65,9 +65,16 @@ extern long int syscall (long int __sysno, ...); | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| static inline int my_mbind(void *addr, unsigned long len, int mode, | static inline int my_mbind(void *addr, unsigned long len, int mode, | ||||
| unsigned long *nodemask, unsigned long maxnode, | unsigned long *nodemask, unsigned long maxnode, | ||||
| unsigned flags) { | unsigned flags) { | ||||
| #if defined (__LSB_VERSION__) | |||||
| // So far, LSB (Linux Standard Base) doesn't support syscall(). | |||||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | |||||
| return 0; | |||||
| #else | |||||
| #if defined (LOONGSON3B) | #if defined (LOONGSON3B) | ||||
| #if defined (__64BIT__) | #if defined (__64BIT__) | ||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | ||||
| @@ -79,11 +86,17 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||||
| // unsigned long null_nodemask=0; | // unsigned long null_nodemask=0; | ||||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | ||||
| #endif | #endif | ||||
| #endif | |||||
| } | } | ||||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | ||||
| #if defined (__LSB_VERSION__) | |||||
| // So far, LSB (Linux Standard Base) doesn't support syscall(). | |||||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | |||||
| return 0; | |||||
| #else | |||||
| return syscall(SYS_set_mempolicy, mode, addr, flag); | return syscall(SYS_set_mempolicy, mode, addr, flag); | ||||
| #endif | |||||
| } | } | ||||
| static inline int my_gettid(void) { | static inline int my_gettid(void) { | ||||
| @@ -235,10 +235,17 @@ REALNAME: ;\ | |||||
| .set noreorder ;\ | .set noreorder ;\ | ||||
| .set nomacro | .set nomacro | ||||
| #if defined(__linux__) && defined(__ELF__) | |||||
| #define GNUSTACK .section .note.GNU-stack,"",%progbits | |||||
| #else | |||||
| #define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | #define EPILOGUE \ | ||||
| .set macro ;\ | .set macro ;\ | ||||
| .set reorder ;\ | .set reorder ;\ | ||||
| .end REALNAME | |||||
| .end REALNAME ;\ | |||||
| GNUSTACK | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| @@ -255,8 +262,8 @@ REALNAME: ;\ | |||||
| #endif | #endif | ||||
| #if defined(LOONGSON3B) | #if defined(LOONGSON3B) | ||||
| #define PAGESIZE (32UL << 10) | |||||
| #define FIXED_PAGESIZE (32UL << 10) | |||||
| #define PAGESIZE (16UL << 10) | |||||
| #define FIXED_PAGESIZE (16UL << 10) | |||||
| #endif | #endif | ||||
| #ifndef PAGESIZE | #ifndef PAGESIZE | ||||
| @@ -199,8 +199,17 @@ static __inline int blas_quickdivide(blasint x, blasint y){ | |||||
| .type REALNAME, #function; \ | .type REALNAME, #function; \ | ||||
| .proc 07; \ | .proc 07; \ | ||||
| REALNAME:; | REALNAME:; | ||||
| #if defined(__linux__) && defined(__ELF__) | |||||
| #define GNUSTACK .section .note.GNU-stack,"",%progbits | |||||
| #else | |||||
| #define GNUSTACK | |||||
| #endif | |||||
| #define EPILOGUE \ | #define EPILOGUE \ | ||||
| .size REALNAME, .-REALNAME | |||||
| .size REALNAME, .-REALNAME; \ | |||||
| GNUSTACK | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define MMXSTORE movd | #define MMXSTORE movd | ||||
| #endif | #endif | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||||
| //Enable some optimization for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | |||||
| #endif | |||||
| #if defined(HAVE_3DNOW) | #if defined(HAVE_3DNOW) | ||||
| #define EMMS femms | #define EMMS femms | ||||
| #elif defined(HAVE_MMX) | #elif defined(HAVE_MMX) | ||||
| @@ -296,7 +301,9 @@ REALNAME: | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| #define EPILOGUE .size REALNAME, .-REALNAME | |||||
| #define EPILOGUE \ | |||||
| .size REALNAME, .-REALNAME; \ | |||||
| .section .note.GNU-stack,"",%progbits | |||||
| #endif | #endif | ||||
| @@ -335,6 +342,7 @@ REALNAME: | |||||
| #define ALIGN_2 .align 2 | #define ALIGN_2 .align 2 | ||||
| #define ALIGN_3 .align 3 | #define ALIGN_3 .align 3 | ||||
| #define ALIGN_4 .align 4 | #define ALIGN_4 .align 4 | ||||
| #define ALIGN_5 .align 5 | |||||
| #define ffreep fstp | #define ffreep fstp | ||||
| #endif | #endif | ||||
| @@ -356,11 +364,10 @@ REALNAME: | |||||
| #ifndef ALIGN_6 | #ifndef ALIGN_6 | ||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| #endif | |||||
| // ffreep %st(0). | // ffreep %st(0). | ||||
| // Because Clang didn't support ffreep, we directly use the opcode. | // Because Clang didn't support ffreep, we directly use the opcode. | ||||
| // Please check out http://www.sandpile.org/x86/opc_fpu.htm | // Please check out http://www.sandpile.org/x86/opc_fpu.htm | ||||
| #ifndef ffreep | #ifndef ffreep | ||||
| #define ffreep .byte 0xdf, 0xc0 # | #define ffreep .byte 0xdf, 0xc0 # | ||||
| #endif | #endif | ||||
| #endif | |||||
| @@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #ifdef ASSEMBLER | #ifdef ASSEMBLER | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||||
| //Enable some optimization for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | |||||
| #endif | |||||
| #if defined(HAVE_3DNOW) | #if defined(HAVE_3DNOW) | ||||
| #define EMMS femms | #define EMMS femms | ||||
| #elif defined(HAVE_MMX) | #elif defined(HAVE_MMX) | ||||
| @@ -367,7 +372,10 @@ REALNAME: | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| #define EPILOGUE .size REALNAME, .-REALNAME | |||||
| #define EPILOGUE \ | |||||
| .size REALNAME, .-REALNAME; \ | |||||
| .section .note.GNU-stack,"",%progbits | |||||
| #endif | #endif | ||||
| @@ -106,6 +106,8 @@ | |||||
| #define CORE_SANDYBRIDGE 20 | #define CORE_SANDYBRIDGE 20 | ||||
| #define CORE_BOBCAT 21 | #define CORE_BOBCAT 21 | ||||
| #define CORE_BULLDOZER 22 | #define CORE_BULLDOZER 22 | ||||
| #define CORE_PILEDRIVER 23 | |||||
| #define CORE_HASWELL CORE_SANDYBRIDGE | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -127,6 +129,7 @@ | |||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | #define HAVE_AVX (1 << 18) | ||||
| #define HAVE_FMA4 (1 << 19) | #define HAVE_FMA4 (1 << 19) | ||||
| #define HAVE_FMA3 (1 << 20) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -196,4 +199,8 @@ typedef struct { | |||||
| #define CPUTYPE_SANDYBRIDGE 44 | #define CPUTYPE_SANDYBRIDGE 44 | ||||
| #define CPUTYPE_BOBCAT 45 | #define CPUTYPE_BOBCAT 45 | ||||
| #define CPUTYPE_BULLDOZER 46 | #define CPUTYPE_BULLDOZER 46 | ||||
| #define CPUTYPE_PILEDRIVER 47 | |||||
| // this define is because BLAS doesn't have haswell specific optimizations yet | |||||
| #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE | |||||
| #endif | #endif | ||||
| @@ -114,6 +114,7 @@ int detect(void){ | |||||
| if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; | if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; | ||||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | ||||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
| @@ -41,10 +41,14 @@ | |||||
| #include "cpuid.h" | #include "cpuid.h" | ||||
| #ifdef NO_AVX | #ifdef NO_AVX | ||||
| #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | |||||
| #define CORE_HASWELL CORE_NEHALEM | |||||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | ||||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | #define CORE_SANDYBRIDGE CORE_NEHALEM | ||||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | ||||
| #define CORE_BULLDOZER CORE_BARCELONA | #define CORE_BULLDOZER CORE_BARCELONA | ||||
| #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | |||||
| #define CORE_PILEDRIVER CORE_BARCELONA | |||||
| #endif | #endif | ||||
| #ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
| @@ -130,7 +134,7 @@ int support_avx(){ | |||||
| int ret=0; | int ret=0; | ||||
| cpuid(1, &eax, &ebx, &ecx, &edx); | cpuid(1, &eax, &ebx, &ecx, &edx); | ||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ | |||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ | |||||
| xgetbv(0, &eax, &edx); | xgetbv(0, &eax, &edx); | ||||
| if((eax & 6) == 6){ | if((eax & 6) == 6){ | ||||
| ret=1; //OS support AVX | ret=1; //OS support AVX | ||||
| @@ -225,6 +229,7 @@ int get_cputype(int gettype){ | |||||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | ||||
| #ifndef NO_AVX | #ifndef NO_AVX | ||||
| if (support_avx()) feature |= HAVE_AVX; | if (support_avx()) feature |= HAVE_AVX; | ||||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||||
| #endif | #endif | ||||
| if (have_excpuid() >= 0x01) { | if (have_excpuid() >= 0x01) { | ||||
| @@ -1050,8 +1055,22 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 12: | |||||
| if(support_avx()) | |||||
| return CPUTYPE_HASWELL; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | } | ||||
| break; | break; | ||||
| case 4: | |||||
| switch (model) { | |||||
| case 5: | |||||
| if(support_avx()) | |||||
| return CPUTYPE_HASWELL; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x7: | case 0x7: | ||||
| @@ -1084,11 +1103,21 @@ int get_cpuname(void){ | |||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 6: | |||||
| switch (model) { | |||||
| case 1: | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 2: | |||||
| if(support_avx()) | |||||
| return CPUTYPE_PILEDRIVER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| } | |||||
| break; | |||||
| case 5: | case 5: | ||||
| return CPUTYPE_BOBCAT; | return CPUTYPE_BOBCAT; | ||||
| } | } | ||||
| @@ -1213,6 +1242,7 @@ static char *cpuname[] = { | |||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1262,6 +1292,7 @@ static char *lowercpuname[] = { | |||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1288,6 +1319,7 @@ static char *corename[] = { | |||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | "BOBCAT", | ||||
| "BULLDOZER", | "BULLDOZER", | ||||
| "PILEDRIVER", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1314,6 +1346,7 @@ static char *corename_lower[] = { | |||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | "bobcat", | ||||
| "bulldozer", | "bulldozer", | ||||
| "piledriver", | |||||
| }; | }; | ||||
| @@ -1424,8 +1457,22 @@ int get_coretype(void){ | |||||
| return CORE_SANDYBRIDGE; | return CORE_SANDYBRIDGE; | ||||
| else | else | ||||
| return CORE_NEHALEM; //OS doesn't support AVX | return CORE_NEHALEM; //OS doesn't support AVX | ||||
| case 12: | |||||
| if(support_avx()) | |||||
| return CORE_HASWELL; | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | } | ||||
| break; | break; | ||||
| case 4: | |||||
| switch (model) { | |||||
| case 5: | |||||
| if(support_avx()) | |||||
| return CORE_HASWELL; | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| } | } | ||||
| break; | break; | ||||
| @@ -1442,11 +1489,19 @@ int get_coretype(void){ | |||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | ||||
| else if (exfamily == 5) return CORE_BOBCAT; | else if (exfamily == 5) return CORE_BOBCAT; | ||||
| else if (exfamily == 6) { | else if (exfamily == 6) { | ||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||||
| switch (model) { | |||||
| case 1: | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. | |||||
| case 2: | |||||
| if(support_avx()) | |||||
| return CORE_PILEDRIVER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. | |||||
| } | |||||
| }else return CORE_BARCELONA; | }else return CORE_BARCELONA; | ||||
| } | } | ||||
| } | } | ||||
| @@ -1534,6 +1589,7 @@ void get_cpuconfig(void){ | |||||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | ||||
| if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n"); | |||||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | ||||
| if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | ||||
| if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | ||||
| @@ -1601,5 +1657,6 @@ void get_sse(void){ | |||||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | ||||
| if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||||
| } | } | ||||
| @@ -1,3 +1,17 @@ | |||||
| //LSB (Linux Standard Base) compiler | |||||
| //only support lsbcc (C); compiling as C++ (lsbc++) is rejected | |||||
| #if defined (__LSB_VERSION__) | |||||
| #if !defined (__cplusplus) | |||||
| COMPILER_LSB | |||||
| #else | |||||
| #error "OpenBLAS only supports lsbcc." | |||||
| #endif | |||||
| #endif | |||||
| #if defined(__clang__) | |||||
| COMPILER_CLANG | |||||
| #endif | |||||
| #if defined(__PGI) || defined(__PGIC__) | #if defined(__PGI) || defined(__PGIC__) | ||||
| COMPILER_PGI | COMPILER_PGI | ||||
| #endif | #endif | ||||
| @@ -77,7 +77,7 @@ endif | |||||
| clean :: | clean :: | ||||
| rm -f x* | rm -f x* | ||||
| FLDFLAGS = $(FFLAGS:-fPIC=) | |||||
| FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||||
| CEXTRALIB = | CEXTRALIB = | ||||
| # Single real | # Single real | ||||
| @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| a = (FLOAT *)args -> a; | a = (FLOAT *)args -> a; | ||||
| x = (FLOAT *)args -> b; | x = (FLOAT *)args -> b; | ||||
| y = (FLOAT *)args -> c; | |||||
| lda = args -> lda; | lda = args -> lda; | ||||
| incx = args -> ldb; | incx = args -> ldb; | ||||
| @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| n_from = 0; | n_from = 0; | ||||
| n_to = n; | n_to = n; | ||||
| //Each thread uses n * COMPSIZE elements at the start of its sb buffer as y | |||||
| y = buffer; | |||||
| buffer += ((COMPSIZE * n + 1023) & ~1023); | |||||
| if (range_m) { | if (range_m) { | ||||
| n_from = *(range_m + 0); | n_from = *(range_m + 0); | ||||
| n_to = *(range_m + 1); | n_to = *(range_m + 1); | ||||
| @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| a += n_from * lda * COMPSIZE; | a += n_from * lda * COMPSIZE; | ||||
| } | } | ||||
| if (range_n) y += *range_n * COMPSIZE; | |||||
| if (incx != 1) { | if (incx != 1) { | ||||
| COPY_K(n, x, incx, buffer, 1); | COPY_K(n, x, incx, buffer, 1); | ||||
| @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| if (num_cpu) { | if (num_cpu) { | ||||
| queue[0].sa = NULL; | queue[0].sa = NULL; | ||||
| queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; | |||||
| queue[0].sb = buffer; | |||||
| queue[num_cpu - 1].next = NULL; | queue[num_cpu - 1].next = NULL; | ||||
| exec_blas(num_cpu, queue); | exec_blas(num_cpu, queue); | ||||
| @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| #else | #else | ||||
| ONE, ZERO, | ONE, ZERO, | ||||
| #endif | #endif | ||||
| buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); | |||||
| (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); | |||||
| } | } | ||||
| AXPYU_K(n, 0, 0, | AXPYU_K(n, 0, 0, | ||||
| @@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| queue[num_cpu].args = arg; | queue[num_cpu].args = arg; | ||||
| queue[num_cpu].range_m = range_m; | queue[num_cpu].range_m = range_m; | ||||
| queue[num_cpu].range_n = &range[num_cpu]; | queue[num_cpu].range_n = &range[num_cpu]; | ||||
| #if defined(LOONGSON3A) | |||||
| #if 0 //defined(LOONGSON3A) | |||||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | ||||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | ||||
| #else | #else | ||||
| @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| } | } | ||||
| if (num_cpu) { | if (num_cpu) { | ||||
| #if defined(LOONGSON3A) | |||||
| #if 0 //defined(LOONGSON3A) | |||||
| queue[0].sa = sa; | queue[0].sa = sa; | ||||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | ||||
| #else | #else | ||||
| @@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| #else | #else | ||||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||
| min_jj = min_j + js - jjs; | min_jj = min_j + js - jjs; | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||||
| #endif | |||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -48,6 +48,12 @@ | |||||
| #define SWITCH_RATIO 2 | #define SWITCH_RATIO 2 | ||||
| #endif | #endif | ||||
| //The array of job_t may overflow the stack. | |||||
| //Instead, use malloc to alloc job_t. | |||||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||||
| #define USE_ALLOC_HEAP | |||||
| #endif | |||||
| #ifndef GEMM3M_LOCAL | #ifndef GEMM3M_LOCAL | ||||
| #if defined(NN) | #if defined(NN) | ||||
| #define GEMM3M_LOCAL GEMM3M_NN | #define GEMM3M_LOCAL GEMM3M_NN | ||||
| @@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| BLASLONG range_M[MAX_CPU_NUMBER + 1]; | BLASLONG range_M[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_N[MAX_CPU_NUMBER + 1]; | BLASLONG range_N[MAX_CPU_NUMBER + 1]; | ||||
| job_t job[MAX_CPU_NUMBER]; | |||||
| #ifndef USE_ALLOC_HEAP | |||||
| job_t job[MAX_CPU_NUMBER]; | |||||
| #else | |||||
| job_t * job = NULL; | |||||
| #endif | |||||
| BLASLONG num_cpu_m, num_cpu_n; | BLASLONG num_cpu_m, num_cpu_n; | ||||
| @@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| newarg.alpha = args -> alpha; | newarg.alpha = args -> alpha; | ||||
| newarg.beta = args -> beta; | newarg.beta = args -> beta; | ||||
| newarg.nthreads = args -> nthreads; | newarg.nthreads = args -> nthreads; | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||||
| if(job==NULL){ | |||||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||||
| exit(1); | |||||
| } | |||||
| #endif | |||||
| newarg.common = (void *)job; | newarg.common = (void *)job; | ||||
| if (!range_m) { | if (!range_m) { | ||||
| @@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| exec_blas(num_cpu_m, queue); | exec_blas(num_cpu_m, queue); | ||||
| } | } | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| free(job); | |||||
| #endif | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -48,6 +48,12 @@ | |||||
| #define SWITCH_RATIO 2 | #define SWITCH_RATIO 2 | ||||
| #endif | #endif | ||||
| //The array of job_t may overflow the stack. | |||||
| //Instead, use malloc to alloc job_t. | |||||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||||
| #define USE_ALLOC_HEAP | |||||
| #endif | |||||
| #ifndef SYRK_LOCAL | #ifndef SYRK_LOCAL | ||||
| #if !defined(LOWER) && !defined(TRANS) | #if !defined(LOWER) && !defined(TRANS) | ||||
| #define SYRK_LOCAL SYRK_UN | #define SYRK_LOCAL SYRK_UN | ||||
| @@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| blas_arg_t newarg; | blas_arg_t newarg; | ||||
| #ifndef USE_ALLOC_HEAP | |||||
| job_t job[MAX_CPU_NUMBER]; | job_t job[MAX_CPU_NUMBER]; | ||||
| #else | |||||
| job_t * job = NULL; | |||||
| #endif | |||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range[MAX_CPU_NUMBER + 100]; | BLASLONG range[MAX_CPU_NUMBER + 100]; | ||||
| @@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| newarg.ldc = args -> ldc; | newarg.ldc = args -> ldc; | ||||
| newarg.alpha = args -> alpha; | newarg.alpha = args -> alpha; | ||||
| newarg.beta = args -> beta; | newarg.beta = args -> beta; | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||||
| if(job==NULL){ | |||||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||||
| exit(1); | |||||
| } | |||||
| #endif | |||||
| newarg.common = (void *)job; | newarg.common = (void *)job; | ||||
| if (!range_n) { | if (!range_n) { | ||||
| @@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| exec_blas(num_cpu, queue); | exec_blas(num_cpu, queue); | ||||
| } | } | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| free(job); | |||||
| #endif | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -48,6 +48,12 @@ | |||||
| #define SWITCH_RATIO 2 | #define SWITCH_RATIO 2 | ||||
| #endif | #endif | ||||
| //The array of job_t may overflow the stack. | |||||
| //Instead, use malloc to alloc job_t. | |||||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||||
| #define USE_ALLOC_HEAP | |||||
| #endif | |||||
| #ifndef GEMM_LOCAL | #ifndef GEMM_LOCAL | ||||
| #if defined(NN) | #if defined(NN) | ||||
| #define GEMM_LOCAL GEMM_NN | #define GEMM_LOCAL GEMM_NN | ||||
| @@ -360,8 +366,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | ||||
| min_jj = MIN(n_to, xxx + div_n) - jjs; | min_jj = MIN(n_to, xxx + div_n) - jjs; | ||||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) | |||||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||||
| else | |||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| START_RPCC(); | START_RPCC(); | ||||
| OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | ||||
| @@ -519,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| blas_arg_t newarg; | blas_arg_t newarg; | ||||
| #ifndef USE_ALLOC_HEAP | |||||
| job_t job[MAX_CPU_NUMBER]; | job_t job[MAX_CPU_NUMBER]; | ||||
| #else | |||||
| job_t * job = NULL; | |||||
| #endif | |||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_M[MAX_CPU_NUMBER + 1]; | BLASLONG range_M[MAX_CPU_NUMBER + 1]; | ||||
| @@ -563,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| newarg.alpha = args -> alpha; | newarg.alpha = args -> alpha; | ||||
| newarg.beta = args -> beta; | newarg.beta = args -> beta; | ||||
| newarg.nthreads = args -> nthreads; | newarg.nthreads = args -> nthreads; | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||||
| if(job==NULL){ | |||||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||||
| exit(1); | |||||
| } | |||||
| #endif | |||||
| newarg.common = (void *)job; | newarg.common = (void *)job; | ||||
| #ifdef PARAMTEST | #ifdef PARAMTEST | ||||
| @@ -634,7 +666,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| num_cpu_n ++; | num_cpu_n ++; | ||||
| } | } | ||||
| for (j = 0; j < num_cpu_m; j++) { | for (j = 0; j < num_cpu_m; j++) { | ||||
| for (i = 0; i < num_cpu_m; i++) { | for (i = 0; i < num_cpu_m; i++) { | ||||
| for (k = 0; k < DIVIDE_RATE; k++) { | for (k = 0; k < DIVIDE_RATE; k++) { | ||||
| @@ -648,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||||
| exec_blas(num_cpu_m, queue); | exec_blas(num_cpu_m, queue); | ||||
| } | } | ||||
| #ifdef USE_ALLOC_HEAP | |||||
| free(job); | |||||
| #endif | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -1,7 +1,7 @@ | |||||
| TOPDIR = ../.. | TOPDIR = ../.. | ||||
| include ../../Makefile.system | include ../../Makefile.system | ||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) | |||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) | |||||
| COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | ||||
| @@ -106,6 +106,9 @@ openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | |||||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | openblas_get_config.$(SUFFIX) : openblas_get_config.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -49,8 +49,12 @@ | |||||
| int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||||
| void goto_set_num_threads(int num_threads) { | void goto_set_num_threads(int num_threads) { | ||||
| int i=0; | |||||
| if (num_threads < 1) num_threads = blas_num_threads; | if (num_threads < 1) num_threads = blas_num_threads; | ||||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | ||||
| @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| omp_set_num_threads(blas_cpu_number); | omp_set_num_threads(blas_cpu_number); | ||||
| //adjust buffer for each thread | |||||
| for(i=0; i<blas_cpu_number; i++){ | |||||
| if(blas_thread_buffer[i]==NULL){ | |||||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||||
| } | |||||
| } | |||||
| for(; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| #if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
| //set parameters for different number of threads. | //set parameters for different number of threads. | ||||
| blas_set_parameter(); | blas_set_parameter(); | ||||
| @@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) { | |||||
| int blas_thread_init(void){ | int blas_thread_init(void){ | ||||
| int i=0; | |||||
| blas_get_cpu_number(); | blas_get_cpu_number(); | ||||
| blas_server_avail = 1; | blas_server_avail = 1; | ||||
| for(i=0; i<blas_num_threads; i++){ | |||||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||||
| } | |||||
| for(; i<MAX_CPU_NUMBER; i++){ | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| int BLASFUNC(blas_thread_shutdown)(void){ | int BLASFUNC(blas_thread_shutdown)(void){ | ||||
| int i=0; | |||||
| blas_server_avail = 0; | blas_server_avail = 0; | ||||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||||
| static void exec_threads(blas_queue_t *queue){ | static void exec_threads(blas_queue_t *queue){ | ||||
| void *buffer, *sa, *sb; | void *buffer, *sa, *sb; | ||||
| int pos=0, release_flag=0; | |||||
| buffer = NULL; | buffer = NULL; | ||||
| sa = queue -> sa; | sa = queue -> sa; | ||||
| sb = queue -> sb; | sb = queue -> sb; | ||||
| @@ -189,9 +222,19 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | ||||
| buffer = blas_memory_alloc(2); | |||||
| pos = omp_get_thread_num(); | |||||
| buffer = blas_thread_buffer[pos]; | |||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| //fallback | |||||
| if(buffer==NULL) { | |||||
| buffer = blas_memory_alloc(2); | |||||
| release_flag=1; | |||||
| } | |||||
| if (sa == NULL) { | |||||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| queue->sa=sa; | |||||
| } | |||||
| if (sb == NULL) { | if (sb == NULL) { | ||||
| if (!(queue -> mode & BLAS_COMPLEX)){ | if (!(queue -> mode & BLAS_COMPLEX)){ | ||||
| @@ -224,6 +267,7 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| } | } | ||||
| @@ -241,7 +285,7 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| } | } | ||||
| if (buffer != NULL) blas_memory_free(buffer); | |||||
| if (release_flag) blas_memory_free(buffer); | |||||
| } | } | ||||
| @@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads) | |||||
| void openblas_set_num_threads(int num) | void openblas_set_num_threads(int num) | ||||
| { | { | ||||
| goto_set_num_threads(num); | goto_set_num_threads(num); | ||||
| } | |||||
| } | |||||
| @@ -64,12 +64,15 @@ extern gotoblas_t gotoblas_BOBCAT; | |||||
| #ifndef NO_AVX | #ifndef NO_AVX | ||||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | extern gotoblas_t gotoblas_SANDYBRIDGE; | ||||
| extern gotoblas_t gotoblas_BULLDOZER; | extern gotoblas_t gotoblas_BULLDOZER; | ||||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||||
| #else | #else | ||||
| //Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | #define gotoblas_BULLDOZER gotoblas_BARCELONA | ||||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||||
| #endif | #endif | ||||
| //Use sandy bridge kernels for haswell. | |||||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||||
| #define VENDOR_INTEL 1 | #define VENDOR_INTEL 1 | ||||
| #define VENDOR_AMD 2 | #define VENDOR_AMD 2 | ||||
| @@ -92,7 +95,7 @@ int support_avx(){ | |||||
| int ret=0; | int ret=0; | ||||
| cpuid(1, &eax, &ebx, &ecx, &edx); | cpuid(1, &eax, &ebx, &ecx, &edx); | ||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ | |||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ | |||||
| xgetbv(0, &eax, &edx); | xgetbv(0, &eax, &edx); | ||||
| if((eax & 6) == 6){ | if((eax & 6) == 6){ | ||||
| ret=1; //OS support AVX | ret=1; //OS support AVX | ||||
| @@ -175,7 +178,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_SANDYBRIDGE; | return &gotoblas_SANDYBRIDGE; | ||||
| else{ | else{ | ||||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| @@ -186,7 +189,27 @@ static gotoblas_t *get_coretype(void){ | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_SANDYBRIDGE; | return &gotoblas_SANDYBRIDGE; | ||||
| else{ | else{ | ||||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| //Intel Haswell | |||||
| if (model == 12) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_HASWELL; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | |||||
| case 4: | |||||
| //Intel Haswell | |||||
| if (model == 5) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_HASWELL; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| @@ -207,13 +230,23 @@ static gotoblas_t *get_coretype(void){ | |||||
| } else if (exfamily == 5) { | } else if (exfamily == 5) { | ||||
| return &gotoblas_BOBCAT; | return &gotoblas_BOBCAT; | ||||
| } else if (exfamily == 6) { | } else if (exfamily == 6) { | ||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(model == 1){ | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_BULLDOZER; | return &gotoblas_BULLDOZER; | ||||
| else{ | else{ | ||||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
| } | |||||
| } | |||||
| }else if(model == 2){ | |||||
| //AMD Piledriver Opteron 6300 / Opteron 4300 / Opteron 3300 | |||||
| if(support_avx()) | |||||
| return &gotoblas_PILEDRIVER; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| @@ -251,6 +284,7 @@ static char *corename[] = { | |||||
| "Sandybridge", | "Sandybridge", | ||||
| "Bobcat", | "Bobcat", | ||||
| "Bulldozer", | "Bulldozer", | ||||
| "Piledriver", | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -273,6 +307,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | ||||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | ||||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -82,6 +82,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #include <dirent.h> | #include <dirent.h> | ||||
| #include <dlfcn.h> | #include <dlfcn.h> | ||||
| #include <unistd.h> | |||||
| #include <string.h> | |||||
| #define MAX_NODES 16 | #define MAX_NODES 16 | ||||
| #define MAX_CPUS 256 | #define MAX_CPUS 256 | ||||
| @@ -314,7 +316,7 @@ static int numa_check(void) { | |||||
| } | } | ||||
| while ((dir = readdir(dp)) != NULL) { | while ((dir = readdir(dp)) != NULL) { | ||||
| if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | |||||
| if (strncmp(dir->d_name, "node", 4)==0) { | |||||
| node = atoi(&dir -> d_name[4]); | node = atoi(&dir -> d_name[4]); | ||||
| @@ -735,7 +737,8 @@ void gotoblas_affinity_init(void) { | |||||
| fprintf(stderr, "Shared Memory Initialization.\n"); | fprintf(stderr, "Shared Memory Initialization.\n"); | ||||
| #endif | #endif | ||||
| common -> num_procs = get_nprocs(); | |||||
| //returns the number of processors which are currently online | |||||
| common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);; | |||||
| if(common -> num_procs > MAX_CPUS) { | if(common -> num_procs > MAX_CPUS) { | ||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | ||||
| @@ -105,6 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | #if defined(OS_FREEBSD) || defined(OS_DARWIN) | ||||
| #include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
| #include <sys/resource.h> | |||||
| #endif | #endif | ||||
| #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) | #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) | ||||
| @@ -125,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define NO_WARMUP | #define NO_WARMUP | ||||
| #endif | #endif | ||||
| #ifdef ALLOC_HUGETLB | |||||
| #ifndef SHM_HUGETLB | |||||
| #define SHM_HUGETLB 04000 | #define SHM_HUGETLB 04000 | ||||
| #endif | #endif | ||||
| @@ -216,6 +217,25 @@ int get_num_procs(void) { | |||||
| } | } | ||||
| return nums; | return nums; | ||||
| } | } | ||||
| /* | |||||
| void set_stack_limit(int limitMB){ | |||||
| int result=0; | |||||
| struct rlimit rl; | |||||
| rlim_t StackSize; | |||||
| StackSize=limitMB*1024*1024; | |||||
| result=getrlimit(RLIMIT_STACK, &rl); | |||||
| if(result==0){ | |||||
| if(rl.rlim_cur < StackSize){ | |||||
| rl.rlim_cur=StackSize; | |||||
| result=setrlimit(RLIMIT_STACK, &rl); | |||||
| if(result !=0){ | |||||
| fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| */ | |||||
| #endif | #endif | ||||
| /* | /* | ||||
| @@ -1248,6 +1268,7 @@ void CONSTRUCTOR gotoblas_init(void) { | |||||
| if (gotoblas_initialized) return; | if (gotoblas_initialized) return; | ||||
| #ifdef PROFILE | #ifdef PROFILE | ||||
| moncontrol (0); | moncontrol (0); | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,52 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(USE_OPENMP) | |||||
| static int parallel = 2 ; | |||||
| #elif defined(SMP_SERVER) | |||||
| static int parallel = 1; | |||||
| #else | |||||
| static int parallel = 0; | |||||
| #endif | |||||
| int CNAME() { | |||||
| return parallel; | |||||
| } | |||||
| int NAME() { | |||||
| return parallel; | |||||
| } | |||||
| @@ -89,7 +89,7 @@ else | |||||
| endif | endif | ||||
| libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def | libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def | ||||
| $(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \ | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) libgoto2_shared.def -shared -o $(@F) \ | |||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | ||||
| -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) | -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) | ||||
| @@ -116,10 +116,15 @@ ifeq ($(OSNAME), Linux) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ||||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | ||||
| -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | ||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| ifneq ($(C_COMPILER), LSB) | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| else | |||||
| #Use FC on LSB | |||||
| $(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| endif | |||||
| rm -f linktest | rm -f linktest | ||||
| endif | endif | ||||
| @@ -130,10 +135,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ||||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | ||||
| -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) | -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) | ||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| rm -f linktest | rm -f linktest | ||||
| endif | endif | ||||
| @@ -143,15 +148,15 @@ ifeq ($(OSNAME), OSF1) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| ../$(LIBSONAME) : | ../$(LIBSONAME) : | ||||
| $(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), SunOS) | ifeq ($(OSNAME), SunOS) | ||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) | ||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||||
| rm -f linktest | rm -f linktest | ||||
| endif | endif | ||||
| @@ -194,7 +199,7 @@ symbol.S : gensymbol | |||||
| perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S | perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S | ||||
| test : linktest.c | test : linktest.c | ||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||||
| rm -f linktest | rm -f linktest | ||||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | linktest.c : gensymbol ../Makefile.system ../getarch.c | ||||
| @@ -49,7 +49,7 @@ | |||||
| cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, | cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, | ||||
| cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | ||||
| cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | ||||
| cblas_ztrsv); | |||||
| cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub ); | |||||
| @exblasobjs = ( | @exblasobjs = ( | ||||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | ||||
| @@ -72,13 +72,18 @@ | |||||
| zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, | zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, | ||||
| ); | ); | ||||
| #both underscore and no underscore | |||||
| @misc_common_objs = ( | |||||
| openblas_set_num_threads, openblas_get_parallel, | |||||
| ); | |||||
| @misc_no_underscore_objs = ( | @misc_no_underscore_objs = ( | ||||
| openblas_set_num_threads, goto_set_num_threads, | |||||
| goto_set_num_threads, | |||||
| openblas_get_config, | openblas_get_config, | ||||
| ); | ); | ||||
| @misc_underscore_objs = ( | @misc_underscore_objs = ( | ||||
| openblas_set_num_threads, | |||||
| ); | ); | ||||
| @lapackobjs = ( | @lapackobjs = ( | ||||
| @@ -111,7 +116,7 @@ | |||||
| # already provided by @blasobjs: xerbla, lsame | # already provided by @blasobjs: xerbla, lsame | ||||
| ilaenv, ieeeck, lsamen, xerbla_array, iparmq, | ilaenv, ieeeck, lsamen, xerbla_array, iparmq, | ||||
| ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, | ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, | ||||
| ilaver, slamch, | |||||
| ilaver, slamch, slamc3, | |||||
| # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. | # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. | ||||
| # excluded: second_$(TIMER) | # excluded: second_$(TIMER) | ||||
| @@ -148,7 +153,7 @@ | |||||
| dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, | dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, | ||||
| dsteqr, dsterf, dlaisnan, disnan, | dsteqr, dsterf, dlaisnan, disnan, | ||||
| dlartgp, dlartgs, | dlartgp, dlartgs, | ||||
| dlamch, | |||||
| dlamch, dlamc3, | |||||
| # SLASRC -- Single precision real LAPACK routines | # SLASRC -- Single precision real LAPACK routines | ||||
| # already provided by @lapackobjs: | # already provided by @lapackobjs: | ||||
| @@ -2671,7 +2676,7 @@ if ($ARGV[5] == 1) { | |||||
| #NO_LAPACK=1 | #NO_LAPACK=1 | ||||
| @underscore_objs = (@blasobjs, @misc_underscore_objs); | @underscore_objs = (@blasobjs, @misc_underscore_objs); | ||||
| } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" || | } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" || | ||||
| -d "../lapack-3.4.2") { | |||||
| -d "../lapack-3.4.2" || -d "../lapack-netlib") { | |||||
| @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); | @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); | ||||
| } else { | } else { | ||||
| @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); | @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); | ||||
| @@ -2679,7 +2684,7 @@ if ($ARGV[5] == 1) { | |||||
| if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; | if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; | ||||
| if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||||
| if ($ARGV[1] eq "x86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||||
| if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | ||||
| @@ -2716,6 +2721,10 @@ $bu = $ARGV[2]; | |||||
| $bu = "" if (($bu eq "0") || ($bu eq "1")); | $bu = "" if (($bu eq "0") || ($bu eq "1")); | ||||
| if ($ARGV[0] eq "linux"){ | if ($ARGV[0] eq "linux"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); | |||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| print $objs, $bu, "\n"; | print $objs, $bu, "\n"; | ||||
| } | } | ||||
| @@ -2733,6 +2742,10 @@ if ($ARGV[0] eq "linux"){ | |||||
| } | } | ||||
| if ($ARGV[0] eq "osx"){ | if ($ARGV[0] eq "osx"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); | |||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| print "_", $objs, $bu, "\n"; | print "_", $objs, $bu, "\n"; | ||||
| } | } | ||||
| @@ -2746,6 +2759,10 @@ if ($ARGV[0] eq "osx"){ | |||||
| } | } | ||||
| if ($ARGV[0] eq "aix"){ | if ($ARGV[0] eq "aix"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); | |||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| print $objs, $bu, "\n"; | print $objs, $bu, "\n"; | ||||
| } | } | ||||
| @@ -2761,23 +2778,31 @@ if ($ARGV[0] eq "aix"){ | |||||
| if ($ARGV[0] eq "win2k"){ | if ($ARGV[0] eq "win2k"){ | ||||
| print "EXPORTS\n"; | print "EXPORTS\n"; | ||||
| $count = 1; | $count = 1; | ||||
| @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); | |||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads | |||||
| $uppercase = $objs; | |||||
| $uppercase =~ tr/[a-z]/[A-Z]/; | |||||
| print "\t$objs=$objs","_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| print "\t",$objs, "_=$objs","_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| print "\t$uppercase=$objs", "_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| } | |||||
| $uppercase = $objs; | |||||
| $uppercase =~ tr/[a-z]/[A-Z]/; | |||||
| print "\t$objs=$objs","_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| print "\t",$objs, "_=$objs","_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| print "\t$uppercase=$objs", "_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| } | |||||
| #for misc_common_objs | |||||
| foreach $objs (@misc_common_objs) { | |||||
| $uppercase = $objs; | |||||
| $uppercase =~ tr/[a-z]/[A-Z]/; | |||||
| print "\t",$objs, "_=$objs","_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| print "\t$uppercase=$objs", "_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| } | } | ||||
| #for openblas_set_num_threads | |||||
| print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; | |||||
| $count ++; | |||||
| foreach $objs (@no_underscore_objs) { | foreach $objs (@no_underscore_objs) { | ||||
| print "\t",$objs,"=$objs"," \@", $count, "\n"; | print "\t",$objs,"=$objs"," \@", $count, "\n"; | ||||
| @@ -2810,6 +2835,9 @@ if ($ARGV[0] eq "win2khpl"){ | |||||
| } | } | ||||
| if ($ARGV[0] eq "microsoft"){ | if ($ARGV[0] eq "microsoft"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| print "EXPORTS\n"; | print "EXPORTS\n"; | ||||
| $count = 1; | $count = 1; | ||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| @@ -2828,6 +2856,9 @@ if ($ARGV[0] eq "microsoft"){ | |||||
| } | } | ||||
| if ($ARGV[0] eq "win2kasm"){ | if ($ARGV[0] eq "win2kasm"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| print "\t.text\n"; | print "\t.text\n"; | ||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| $uppercase = $objs; | $uppercase = $objs; | ||||
| @@ -2841,6 +2872,10 @@ if ($ARGV[0] eq "win2kasm"){ | |||||
| } | } | ||||
| if ($ARGV[0] eq "linktest"){ | if ($ARGV[0] eq "linktest"){ | ||||
| @underscore_objs = (@underscore_objs, @misc_common_objs); | |||||
| @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); | |||||
| print "int main(void){\n"; | print "int main(void){\n"; | ||||
| foreach $objs (@underscore_objs) { | foreach $objs (@underscore_objs) { | ||||
| print $objs, $bu, "();\n" if $objs ne "xerbla"; | print $objs, $bu, "();\n" if $objs ne "xerbla"; | ||||
| @@ -24,7 +24,7 @@ $compiler = "" if $compiler eq "f77"; | |||||
| if ($compiler eq "") { | if ($compiler eq "") { | ||||
| @lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", | |||||
| @lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", | |||||
| "sunf77", "sunf90", "sunf95", | "sunf77", "sunf90", "sunf95", | ||||
| "xlf95", "xlf90", "xlf", | "xlf95", "xlf90", "xlf", | ||||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | ||||
| @@ -83,6 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef linux | #ifdef linux | ||||
| #include <sys/sysinfo.h> | #include <sys/sysinfo.h> | ||||
| #include <unistd.h> | |||||
| #endif | #endif | ||||
| /* #define FORCE_P2 */ | /* #define FORCE_P2 */ | ||||
| @@ -96,14 +97,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_PENRYN */ | /* #define FORCE_PENRYN */ | ||||
| /* #define FORCE_DUNNINGTON */ | /* #define FORCE_DUNNINGTON */ | ||||
| /* #define FORCE_NEHALEM */ | /* #define FORCE_NEHALEM */ | ||||
| /* #define FORCE_SANDYBRIDGE */ | |||||
| /* #define FORCE_ATOM */ | |||||
| /* #define FORCE_ATHLON */ | /* #define FORCE_ATHLON */ | ||||
| /* #define FORCE_OPTERON */ | /* #define FORCE_OPTERON */ | ||||
| /* #define FORCE_OPTERON_SSE3 */ | /* #define FORCE_OPTERON_SSE3 */ | ||||
| /* #define FORCE_BARCELONA */ | /* #define FORCE_BARCELONA */ | ||||
| /* #define FORCE_SHANGHAI */ | /* #define FORCE_SHANGHAI */ | ||||
| /* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
| /* #define FORCE_BOBCAT */ | |||||
| /* #define FORCE_BULLDOZER */ | /* #define FORCE_BULLDOZER */ | ||||
| /* #define FORCE_BOBCAT */ | |||||
| /* #define FORCE_PILEDRIVER */ | |||||
| /* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
| /* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
| /* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
| @@ -118,12 +122,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_PPC440FP2 */ | /* #define FORCE_PPC440FP2 */ | ||||
| /* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3A */ | |||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_LOONGSON3A */ | |||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_ITANIUM2 */ | /* #define FORCE_ITANIUM2 */ | ||||
| /* #define FORCE_GENERIC */ | |||||
| /* #define FORCE_SPARC */ | /* #define FORCE_SPARC */ | ||||
| /* #define FORCE_SPARCV7 */ | /* #define FORCE_SPARCV7 */ | ||||
| /* #define FORCE_GENERIC */ | |||||
| #ifdef FORCE_P2 | #ifdef FORCE_P2 | ||||
| #define FORCE | #define FORCE | ||||
| @@ -139,32 +143,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "P5" | #define CORENAME "P5" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_COPPERMINE | |||||
| #ifdef FORCE_KATMAI | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #define SUBARCHITECTURE "PENTIUM3" | #define SUBARCHITECTURE "PENTIUM3" | ||||
| #define ARCHCONFIG "-DPENTIUM3 " \ | #define ARCHCONFIG "-DPENTIUM3 " \ | ||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | ||||
| #define LIBNAME "coppermine" | |||||
| #define CORENAME "COPPERMINE" | |||||
| #define LIBNAME "katmai" | |||||
| #define CORENAME "KATMAI" | |||||
| #endif | #endif | ||||
| #ifdef FORCE_KATMAI | |||||
| #ifdef FORCE_COPPERMINE | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #define SUBARCHITECTURE "PENTIUM3" | #define SUBARCHITECTURE "PENTIUM3" | ||||
| #define ARCHCONFIG "-DPENTIUM3 " \ | #define ARCHCONFIG "-DPENTIUM3 " \ | ||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | ||||
| #define LIBNAME "katmai" | |||||
| #define CORENAME "KATMAI" | |||||
| #define LIBNAME "coppermine" | |||||
| #define CORENAME "COPPERMINE" | |||||
| #endif | #endif | ||||
| #ifdef FORCE_NORTHWOOD | #ifdef FORCE_NORTHWOOD | ||||
| @@ -396,6 +400,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "BULLDOZER" | #define CORENAME "BULLDOZER" | ||||
| #endif | #endif | ||||
| #if defined (FORCE_PILEDRIVER) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "PILEDRIVER" | |||||
| #define ARCHCONFIG "-DPILEDRIVER " \ | |||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||||
| #define LIBNAME "piledriver" | |||||
| #define CORENAME "PILEDRIVER" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -717,7 +737,8 @@ static int get_num_cores(void) { | |||||
| #endif | #endif | ||||
| #ifdef linux | #ifdef linux | ||||
| return get_nprocs(); | |||||
| //returns the number of processors which are currently online | |||||
| return sysconf(_SC_NPROCESSORS_ONLN); | |||||
| #elif defined(OS_WINDOWS) | #elif defined(OS_WINDOWS) | ||||
| @@ -802,8 +823,12 @@ int main(int argc, char *argv[]){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if NO_PARALLEL_MAKE==1 | |||||
| printf("MAKE += -j 1\n"); | |||||
| #else | |||||
| #ifndef OS_WINDOWS | #ifndef OS_WINDOWS | ||||
| printf("MAKE += -j %d\n", get_num_cores()); | printf("MAKE += -j %d\n", get_num_cores()); | ||||
| #endif | |||||
| #endif | #endif | ||||
| break; | break; | ||||
| @@ -8,7 +8,7 @@ | |||||
| int main(int argc, char **argv) { | int main(int argc, char **argv) { | ||||
| if ((argc < 1) || (*argv[1] == '0')) { | |||||
| if ( (argc <= 1) || (argc >= 2) && (*argv[1] == '0')) { | |||||
| printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); | printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); | ||||
| printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); | printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); | ||||
| printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); | printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); | ||||
| @@ -22,10 +22,48 @@ int main(int argc, char **argv) { | |||||
| printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); | printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); | ||||
| printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); | printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); | ||||
| printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); | printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); | ||||
| #ifdef CGEMM3M_DEFAULT_UNROLL_M | |||||
| printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M); | |||||
| #else | |||||
| printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); | |||||
| #endif | |||||
| #ifdef CGEMM3M_DEFAULT_UNROLL_N | |||||
| printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N); | |||||
| #else | |||||
| printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); | |||||
| #endif | |||||
| #ifdef ZGEMM3M_DEFAULT_UNROLL_M | |||||
| printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M); | |||||
| #else | |||||
| printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); | |||||
| #endif | |||||
| #ifdef ZGEMM3M_DEFAULT_UNROLL_N | |||||
| printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N); | |||||
| #else | |||||
| printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); | |||||
| #endif | |||||
| #ifdef XGEMM3M_DEFAULT_UNROLL_M | |||||
| printf("XGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M); | |||||
| #else | |||||
| printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); | |||||
| #endif | |||||
| #ifdef XGEMM3M_DEFAULT_UNROLL_N | |||||
| printf("XGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N); | |||||
| #else | |||||
| printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); | |||||
| #endif | |||||
| } | } | ||||
| if ((argc >= 1) && (*argv[1] == '1')) { | |||||
| if ((argc >= 2) && (*argv[1] == '1')) { | |||||
| printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); | printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); | ||||
| printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); | printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); | ||||
| printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); | printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); | ||||
| @@ -60,6 +60,8 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT * | |||||
| }; | }; | ||||
| #endif | #endif | ||||
| extern void dtrtri_lapack_(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info); | |||||
| int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | ||||
| blas_arg_t args; | blas_arg_t args; | ||||
| @@ -83,6 +85,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In | |||||
| TOUPPER(uplo_arg); | TOUPPER(uplo_arg); | ||||
| TOUPPER(diag_arg); | TOUPPER(diag_arg); | ||||
| uplo = -1; | uplo = -1; | ||||
| if (uplo_arg == 'U') uplo = 0; | if (uplo_arg == 'U') uplo = 0; | ||||
| if (uplo_arg == 'L') uplo = 1; | if (uplo_arg == 'L') uplo = 1; | ||||
| @@ -90,6 +93,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In | |||||
| if (diag_arg == 'U') diag = 0; | if (diag_arg == 'U') diag = 0; | ||||
| if (diag_arg == 'N') diag = 1; | if (diag_arg == 'N') diag = 1; | ||||
| info = 0; | info = 0; | ||||
| if (args.lda < MAX(1,args.n)) info = 5; | if (args.lda < MAX(1,args.n)) info = 5; | ||||
| if (args.n < 0) info = 3; | if (args.n < 0) info = 3; | ||||
| @@ -129,6 +133,15 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In | |||||
| if (args.nthreads == 1) { | if (args.nthreads == 1) { | ||||
| #endif | #endif | ||||
| #if DOUBLE | |||||
| // double trtri_U single thread error | |||||
| // call dtrtri from lapack for a walk around. | |||||
| if(uplo==0){ | |||||
| dtrtri_lapack_(UPLO, DIAG, N, a, ldA, Info); | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); | *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); | ||||
| #ifdef SMP | #ifdef SMP | ||||
| @@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER | |||||
| $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ | $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ | ||||
| $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) | $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) | ||||
| $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ | |||||
| $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) | $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ | ||||
| @@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M | |||||
| $(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c | $(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c | ||||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | ||||
| @@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_ | |||||
| $(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c | $(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c | ||||
| $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c | |||||
| $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c | |||||
| $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c | |||||
| $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c | |||||
| $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c | |||||
| $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c | |||||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||||
| $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | $(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | ||||
| @@ -826,6 +826,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef PILEDRIVER | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Piledriver\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1,59 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -101,10 +101,10 @@ | |||||
| #define Y 36 + STACKSIZE+ARGS(%esp) | #define Y 36 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | #define BUFFER 44 + STACKSIZE+ARGS(%esp) | ||||
| #define MMM 0+ARGS(%esp) | #define MMM 0+ARGS(%esp) | ||||
| #define YY 4+ARGS(%esp) | #define YY 4+ARGS(%esp) | ||||
| #define AA 8+ARGS(%esp) | #define AA 8+ARGS(%esp) | ||||
| #define LDAX 12+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -153,8 +153,8 @@ | |||||
| movl YY,J | movl YY,J | ||||
| movl J,Y | movl J,Y | ||||
| movl STACK_LDA, LDA | |||||
| movl STACK_LDA, LDA | |||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| @@ -688,9 +688,9 @@ | |||||
| movl M,J | movl M,J | ||||
| leal (,J,SIZE),%eax | leal (,J,SIZE),%eax | ||||
| addl %eax,AA | addl %eax,AA | ||||
| movl YY,J | |||||
| addl %eax,J | |||||
| movl J,YY | |||||
| movl STACK_INCY,INCY | |||||
| imull INCY,%eax | |||||
| addl %eax,YY | |||||
| jmp .L0t | jmp .L0t | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -714,9 +714,9 @@ | |||||
| movl M,J | movl M,J | ||||
| leal (,J,SIZE),%eax | leal (,J,SIZE),%eax | ||||
| addl %eax,AA | addl %eax,AA | ||||
| movl YY,J | |||||
| addl %eax,J | |||||
| movl J,YY | |||||
| movl STACK_INCY,INCY | |||||
| imull INCY,%eax | |||||
| addl %eax,YY | |||||
| jmp .L0t | jmp .L0t | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -102,11 +102,9 @@ | |||||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | #define BUFFER 44 + STACKSIZE+ARGS(%esp) | ||||
| #define MMM 0+STACKSIZE(%esp) | |||||
| #define NN 4+STACKSIZE(%esp) | |||||
| #define AA 8+STACKSIZE(%esp) | |||||
| #define LDAX 12+STACKSIZE(%esp) | |||||
| #define XX 16+STACKSIZE(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define AA 4+ARGS(%esp) | |||||
| #define XX 8+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -129,12 +127,8 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_LDA, LDA | |||||
| movl LDA,LDAX # backup LDA | |||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl X,XX | movl X,XX | ||||
| movl N,J | |||||
| movl J,NN # backup N | |||||
| movl A,J | movl A,J | ||||
| movl J,AA # backup A | movl J,AA # backup A | ||||
| movl M,J | movl M,J | ||||
| @@ -144,7 +138,6 @@ | |||||
| addl $1,J | addl $1,J | ||||
| sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) | sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) | ||||
| subl $8, J # Don't use last 8 float in the buffer. | subl $8, J # Don't use last 8 float in the buffer. | ||||
| # Now, split M by block J | |||||
| subl J,MMM # MMM=MMM-J | subl J,MMM # MMM=MMM-J | ||||
| movl J,M | movl J,M | ||||
| jge .L00t | jge .L00t | ||||
| @@ -159,13 +152,10 @@ | |||||
| movl AA,%eax | movl AA,%eax | ||||
| movl %eax,A # mov AA to A | movl %eax,A # mov AA to A | ||||
| movl NN,%eax | |||||
| movl %eax,N # reset N | |||||
| movl LDAX, LDA # reset LDA | |||||
| movl XX,X | |||||
| movl XX,%eax | |||||
| movl %eax,X | |||||
| movl STACK_LDA, LDA | |||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -688,9 +678,9 @@ | |||||
| movl M,J | movl M,J | ||||
| leal (,J,SIZE),%eax | leal (,J,SIZE),%eax | ||||
| addl %eax,AA | addl %eax,AA | ||||
| movl XX,J | |||||
| addl %eax,J | |||||
| movl J,XX | |||||
| movl STACK_INCX,INCX | |||||
| imull INCX,%eax | |||||
| addl %eax,XX | |||||
| jmp .L0t | jmp .L0t | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -76,7 +76,7 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define ARGS 16 | |||||
| #define ARGS 20 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | #define M 4 + STACKSIZE+ARGS(%esp) | ||||
| #define N 8 + STACKSIZE+ARGS(%esp) | #define N 8 + STACKSIZE+ARGS(%esp) | ||||
| @@ -89,10 +89,9 @@ | |||||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | #define BUFFER 48 + STACKSIZE+ARGS(%esp) | ||||
| #define MMM 0+STACKSIZE(%esp) | |||||
| #define AA 4+STACKSIZE(%esp) | |||||
| #define LDAX 8+STACKSIZE(%esp) | |||||
| #define NN 12+STACKSIZE(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define AA 4+ARGS(%esp) | |||||
| #define XX 8+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -117,10 +116,8 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_LDA, LDA | |||||
| movl LDA,LDAX # backup LDA | |||||
| movl N,J | |||||
| movl J,NN # backup N | |||||
| movl STACK_X, X | |||||
| movl X,XX | |||||
| movl A,J | movl A,J | ||||
| movl J,AA # backup A | movl J,AA # backup A | ||||
| movl M,J | movl M,J | ||||
| @@ -130,7 +127,6 @@ | |||||
| addl $1,J | addl $1,J | ||||
| sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) | sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) | ||||
| subl $4, J # Don't use last 4 double in the buffer. | subl $4, J # Don't use last 4 double in the buffer. | ||||
| # Now, split M by block J | |||||
| subl J,MMM # MMM=MMM-J | subl J,MMM # MMM=MMM-J | ||||
| movl J,M | movl J,M | ||||
| jge .L00t | jge .L00t | ||||
| @@ -142,15 +138,13 @@ | |||||
| movl %eax,M | movl %eax,M | ||||
| .L00t: | .L00t: | ||||
| movl XX,%eax | |||||
| movl %eax, X | |||||
| movl AA,%eax | movl AA,%eax | ||||
| movl %eax,A # mov AA to A | movl %eax,A # mov AA to A | ||||
| movl NN,%eax | |||||
| movl %eax,N # reset N | |||||
| movl LDAX, LDA # reset LDA | |||||
| movl STACK_X, X | |||||
| movl STACK_LDA, LDA | |||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -605,6 +599,9 @@ | |||||
| movl M,J | movl M,J | ||||
| leal (,J,SIZE),%eax | leal (,J,SIZE),%eax | ||||
| addl %eax,AA | addl %eax,AA | ||||
| movl STACK_INCX,INCX | |||||
| imull INCX,%eax | |||||
| addl %eax,XX | |||||
| jmp .L0t | jmp .L0t | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -74,11 +74,11 @@ | |||||
| #else | #else | ||||
| movl %eax, %ecx | movl %eax, %ecx | ||||
| subl $32, %ecx | subl $32, %ecx | ||||
| cmovg %ecx, %eax | |||||
| cmovge %ecx, %eax | |||||
| movl %edx, %ecx | movl %edx, %ecx | ||||
| subl $32, %ecx | subl $32, %ecx | ||||
| cmovg %ecx, %edx | |||||
| cmovge %ecx, %edx | |||||
| subl %eax, %edx | subl %eax, %edx | ||||
| movl $0, %eax | movl $0, %eax | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -488,7 +488,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1697,7 +1697,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1727,7 +1727,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -437,7 +437,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -833,7 +833,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1848,7 +1848,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2109,7 +2109,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2429,7 +2429,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -2459,7 +2459,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2952,7 +2952,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -3148,7 +3148,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3389,7 +3389,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -3404,7 +3404,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -910,7 +910,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -959,7 +959,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1439,7 +1439,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1469,7 +1469,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -872,7 +872,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1316,7 +1316,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1855,7 +1855,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1885,7 +1885,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2249,7 +2249,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2562,7 +2562,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2957,7 +2957,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -2972,7 +2972,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -3280,7 +3280,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3515,7 +3515,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -1036,7 +1036,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1066,7 +1066,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -2224,7 +2224,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -2273,7 +2273,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -454,7 +454,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -758,7 +758,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -993,7 +993,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -1324,7 +1324,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1354,7 +1354,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1718,7 +1718,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2031,7 +2031,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2859,7 +2859,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -3303,7 +3303,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -89,18 +89,23 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE(%esp) | |||||
| #define ALPHA_I 20 + STACKSIZE(%esp) | |||||
| #define A 24 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||||
| #define STACK_X 32 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||||
| #define Y 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||||
| #define BUFFER 48 + STACKSIZE(%esp) | |||||
| #define ARGS 20 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_I 20 + STACKSIZE+ARGS(%esp) | |||||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define YY 4+ARGS(%esp) | |||||
| #define AA 8+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -123,6 +128,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -130,6 +136,33 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl Y,J | |||||
| movl J,YY | |||||
| movl A,J | |||||
| movl J,AA | |||||
| movl M,J | |||||
| movl J,MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $20,J | |||||
| subl J,MMM | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_3 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl YY,J | |||||
| movl J,Y | |||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| @@ -595,10 +628,21 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movl M,%eax | |||||
| sall $ZBASE_SHIFT,%eax | |||||
| addl %eax,AA | |||||
| movl STACK_INCY,INCY | |||||
| imull INCY,%eax | |||||
| addl %eax,YY | |||||
| jmp .L0t | |||||
| ALIGN_3 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -76,18 +76,23 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define ARGS 16 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_I 24 + STACKSIZE+ARGS(%esp) | |||||
| #define A 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 36 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 44 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 48 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 52 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 56 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0 + ARGS(%esp) | |||||
| #define YY 4 + ARGS(%esp) | |||||
| #define AA 8 + ARGS(%esp) | |||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE(%esp) | |||||
| #define ALPHA_I 24 + STACKSIZE(%esp) | |||||
| #define A 32 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 36 + STACKSIZE(%esp) | |||||
| #define STACK_X 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 44 + STACKSIZE(%esp) | |||||
| #define Y 48 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 52 + STACKSIZE(%esp) | |||||
| #define BUFFER 56 + STACKSIZE(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -110,6 +115,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -117,6 +123,33 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl Y,J | |||||
| movl J,YY | |||||
| movl A,J | |||||
| movl J,AA | |||||
| movl M,J | |||||
| movl J,MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $18,J | |||||
| subl J,MMM | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_3 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl YY,J | |||||
| movl J,Y | |||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| @@ -458,10 +491,21 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movl M,%eax | |||||
| sall $ZBASE_SHIFT,%eax | |||||
| addl %eax,AA | |||||
| movl STACK_INCY,INCY | |||||
| imull INCY,%eax | |||||
| addl %eax,YY | |||||
| jmp .L0t | |||||
| ALIGN_3 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -89,18 +89,23 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE(%esp) | |||||
| #define ALPHA_I 20 + STACKSIZE(%esp) | |||||
| #define A 24 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||||
| #define STACK_X 32 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||||
| #define Y 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||||
| #define BUFFER 48 + STACKSIZE(%esp) | |||||
| #define ARGS 20 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_I 20 + STACKSIZE+ARGS(%esp) | |||||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define XX 4+ARGS(%esp) | |||||
| #define AA 8+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -123,6 +128,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -130,8 +136,35 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_LDA, LDA | |||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl X,XX | |||||
| movl A,J | |||||
| movl J,AA #backup A | |||||
| movl M,J | |||||
| movl J,MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $20,J | |||||
| subl $8,J | |||||
| subl J,MMM #MMM-=J | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl XX,%eax | |||||
| movl %eax,X | |||||
| movl STACK_LDA,LDA | |||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -513,10 +546,22 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L999: | .L999: | ||||
| movl M,%eax | |||||
| sall $ZBASE_SHIFT, %eax | |||||
| addl %eax,AA | |||||
| movl STACK_INCX,INCX | |||||
| imull INCX,%eax | |||||
| addl %eax,XX | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -76,19 +76,24 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define ARGS 20 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA_I 24 + STACKSIZE+ARGS(%esp) | |||||
| #define A 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 36 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 44 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 48 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 52 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 56 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0 + ARGS(%esp) | |||||
| #define AA 4 + ARGS(%esp) | |||||
| #define XX 8 + ARGS(%esp) | |||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA_R 16 + STACKSIZE(%esp) | |||||
| #define ALPHA_I 24 + STACKSIZE(%esp) | |||||
| #define A 32 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 36 + STACKSIZE(%esp) | |||||
| #define STACK_X 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 44 + STACKSIZE(%esp) | |||||
| #define Y 48 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 52 + STACKSIZE(%esp) | |||||
| #define BUFFER 56 + STACKSIZE(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -110,6 +115,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -117,8 +123,35 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_X, X | |||||
| movl X, XX | |||||
| movl A,J | |||||
| movl J,AA | |||||
| movl M,J | |||||
| movl J,MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $18,J | |||||
| subl $4,J | |||||
| subl J,MMM | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax, M | |||||
| .L00t: | |||||
| movl XX, %eax | |||||
| movl %eax, X | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl STACK_X, X | |||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -188,7 +221,7 @@ | |||||
| movl Y, Y1 | movl Y, Y1 | ||||
| movl N, J | movl N, J | ||||
| ALIGN_3 | |||||
| ALIGN_4 | |||||
| .L11: | .L11: | ||||
| movl BUFFER, X | movl BUFFER, X | ||||
| @@ -395,10 +428,21 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L999: | .L999: | ||||
| movl M,%eax | |||||
| sall $ZBASE_SHIFT,%eax | |||||
| addl %eax,AA | |||||
| movl STACK_INCX,INCX | |||||
| imull INCX,%eax | |||||
| addl %eax,XX | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -533,7 +533,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -994,7 +994,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -1820,7 +1820,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -1,62 +1,71 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | ZGEMVNKERNEL = zgemv_n_dup.S | ||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||||
| DAXPYKERNEL = daxpy_bulldozer.S | |||||
| DDOTKERNEL = ddot_bulldozer.S | |||||
| DCOPYKERNEL = dcopy_bulldozer.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | ||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | ||||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||||
| ZGEMMINCOPY = | ZGEMMINCOPY = | ||||
| ZGEMMITCOPY = | ZGEMMITCOPY = | ||||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = | ZGEMMINCOPYOBJ = | ||||
| ZGEMMITCOPYOBJ = | ZGEMMITCOPYOBJ = | ||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| @@ -0,0 +1,70 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | |||||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||||
| DAXPYKERNEL = daxpy_bulldozer.S | |||||
| DDOTKERNEL = ddot_bulldozer.S | |||||
| DCOPYKERNEL = dcopy_bulldozer.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||||
| DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -69,7 +69,7 @@ | |||||
| #endif | #endif | ||||
| movaps %xmm0, ALPHA | movaps %xmm0, ALPHA | ||||
| #else | #else | ||||
| movaps %xmm3, ALPHA | |||||
| movq 40(%rsp), X | movq 40(%rsp), X | ||||
| movq 48(%rsp), INCX | movq 48(%rsp), INCX | ||||
| @@ -79,6 +79,10 @@ | |||||
| SAVEREGISTERS | SAVEREGISTERS | ||||
| #ifdef WINDOWS_ABI | |||||
| movaps %xmm3, ALPHA | |||||
| #endif | |||||
| shufps $0, ALPHA, ALPHA | shufps $0, ALPHA, ALPHA | ||||
| leaq (, INCX, SIZE), INCX | leaq (, INCX, SIZE), INCX | ||||
| @@ -69,7 +69,6 @@ | |||||
| #endif | #endif | ||||
| movaps %xmm0, ALPHA | movaps %xmm0, ALPHA | ||||
| #else | #else | ||||
| movaps %xmm3, ALPHA | |||||
| movq 40(%rsp), X | movq 40(%rsp), X | ||||
| movq 48(%rsp), INCX | movq 48(%rsp), INCX | ||||
| @@ -79,6 +78,10 @@ | |||||
| SAVEREGISTERS | SAVEREGISTERS | ||||
| #ifdef WINDOWS_ABI | |||||
| movaps %xmm3, ALPHA | |||||
| #endif | |||||
| unpcklpd ALPHA, ALPHA | unpcklpd ALPHA, ALPHA | ||||
| leaq (, INCX, SIZE), INCX | leaq (, INCX, SIZE), INCX | ||||
| @@ -47,14 +47,22 @@ | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define STACKSIZE 64 | |||||
| #define STACKSIZE 128 | |||||
| #define OLD_INCX 8 + STACKSIZE(%rsp) | #define OLD_INCX 8 + STACKSIZE(%rsp) | ||||
| #define OLD_Y 16 + STACKSIZE(%rsp) | #define OLD_Y 16 + STACKSIZE(%rsp) | ||||
| #define OLD_INCY 24 + STACKSIZE(%rsp) | #define OLD_INCY 24 + STACKSIZE(%rsp) | ||||
| #define OLD_BUFFER 32 + STACKSIZE(%rsp) | #define OLD_BUFFER 32 + STACKSIZE(%rsp) | ||||
| #define ALPHA 48 (%rsp) | #define ALPHA 48 (%rsp) | ||||
| #define MMM 64(%rsp) | |||||
| #define NN 72(%rsp) | |||||
| #define AA 80(%rsp) | |||||
| #define XX 88(%rsp) | |||||
| #define LDAX 96(%rsp) | |||||
| #define ALPHAR 104(%rsp) | |||||
| #define ALPHAI 112(%rsp) | |||||
| #define M %rdi | #define M %rdi | ||||
| #define N %rsi | #define N %rsi | ||||
| #define A %rcx | #define A %rcx | ||||
| @@ -66,7 +74,7 @@ | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | |||||
| #define STACKSIZE 288 | |||||
| #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) | #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) | ||||
| #define OLD_A 48 + STACKSIZE(%rsp) | #define OLD_A 48 + STACKSIZE(%rsp) | ||||
| @@ -78,6 +86,14 @@ | |||||
| #define OLD_BUFFER 96 + STACKSIZE(%rsp) | #define OLD_BUFFER 96 + STACKSIZE(%rsp) | ||||
| #define ALPHA 224 (%rsp) | #define ALPHA 224 (%rsp) | ||||
| #define MMM 232(%rsp) | |||||
| #define NN 240(%rsp) | |||||
| #define AA 248(%rsp) | |||||
| #define XX 256(%rsp) | |||||
| #define LDAX 264(%rsp) | |||||
| #define ALPHAR 272(%rsp) | |||||
| #define ALPHAI 280(%rsp) | |||||
| #define M %rcx | #define M %rcx | ||||
| #define N %rdx | #define N %rdx | ||||
| #define A %r8 | #define A %r8 | ||||
| @@ -142,9 +158,37 @@ | |||||
| movaps %xmm3, %xmm0 | movaps %xmm3, %xmm0 | ||||
| movss OLD_ALPHA_I, %xmm1 | movss OLD_ALPHA_I, %xmm1 | ||||
| #endif | #endif | ||||
| movq A, AA | |||||
| movq N, NN | |||||
| movq M, MMM | |||||
| movq LDA, LDAX | |||||
| movq X, XX | |||||
| movq OLD_Y, Y | |||||
| movss %xmm0,ALPHAR | |||||
| movss %xmm1,ALPHAI | |||||
| .L0t: | |||||
| xorq I,I | |||||
| addq $1,I | |||||
| salq $20,I | |||||
| subq I,MMM | |||||
| movq I,M | |||||
| movss ALPHAR,%xmm0 | |||||
| movss ALPHAI,%xmm1 | |||||
| jge .L00t | |||||
| movq MMM,M | |||||
| addq I,M | |||||
| jle .L999x | |||||
| .L00t: | |||||
| movq AA, A | |||||
| movq NN, N | |||||
| movq LDAX, LDA | |||||
| movq XX, X | |||||
| movq OLD_INCX, INCX | movq OLD_INCX, INCX | ||||
| movq OLD_Y, Y | |||||
| # movq OLD_Y, Y | |||||
| movq OLD_INCY, INCY | movq OLD_INCY, INCY | ||||
| movq OLD_BUFFER, BUFFER | movq OLD_BUFFER, BUFFER | ||||
| @@ -4274,6 +4318,11 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movq M, I | |||||
| salq $ZBASE_SHIFT,I | |||||
| addq I,AA | |||||
| jmp .L0t | |||||
| .L999x: | |||||
| movq 0(%rsp), %rbx | movq 0(%rsp), %rbx | ||||
| movq 8(%rsp), %rbp | movq 8(%rsp), %rbp | ||||
| movq 16(%rsp), %r12 | movq 16(%rsp), %r12 | ||||
| @@ -47,13 +47,19 @@ | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define STACKSIZE 64 | |||||
| #define STACKSIZE 128 | |||||
| #define OLD_INCX 8 + STACKSIZE(%rsp) | #define OLD_INCX 8 + STACKSIZE(%rsp) | ||||
| #define OLD_Y 16 + STACKSIZE(%rsp) | #define OLD_Y 16 + STACKSIZE(%rsp) | ||||
| #define OLD_INCY 24 + STACKSIZE(%rsp) | #define OLD_INCY 24 + STACKSIZE(%rsp) | ||||
| #define OLD_BUFFER 32 + STACKSIZE(%rsp) | #define OLD_BUFFER 32 + STACKSIZE(%rsp) | ||||
| #define ALPHA 48 (%rsp) | #define ALPHA 48 (%rsp) | ||||
| #define MMM 64(%rsp) | |||||
| #define NN 72(%rsp) | |||||
| #define AA 80(%rsp) | |||||
| #define LDAX 88(%rsp) | |||||
| #define ALPHAR 96(%rsp) | |||||
| #define ALPHAI 104(%rsp) | |||||
| #define M %rdi | #define M %rdi | ||||
| #define N %rsi | #define N %rsi | ||||
| @@ -66,7 +72,7 @@ | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | |||||
| #define STACKSIZE 288 | |||||
| #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) | #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) | ||||
| #define OLD_A 48 + STACKSIZE(%rsp) | #define OLD_A 48 + STACKSIZE(%rsp) | ||||
| @@ -78,6 +84,13 @@ | |||||
| #define OLD_BUFFER 96 + STACKSIZE(%rsp) | #define OLD_BUFFER 96 + STACKSIZE(%rsp) | ||||
| #define ALPHA 224 (%rsp) | #define ALPHA 224 (%rsp) | ||||
| #define MMM 232(%rsp) | |||||
| #define NN 240(%rsp) | |||||
| #define AA 248(%rsp) | |||||
| #define LDAX 256(%rsp) | |||||
| #define ALPHAR 264(%rsp) | |||||
| #define ALPHAI 272(%rsp) | |||||
| #define M %rcx | #define M %rcx | ||||
| #define N %rdx | #define N %rdx | ||||
| #define A %r8 | #define A %r8 | ||||
| @@ -144,6 +157,32 @@ | |||||
| movss OLD_ALPHA_I, %xmm1 | movss OLD_ALPHA_I, %xmm1 | ||||
| #endif | #endif | ||||
| movq A, AA | |||||
| movq N, NN | |||||
| movq M, MMM | |||||
| movq LDA, LDAX | |||||
| movss %xmm0,ALPHAR | |||||
| movss %xmm1,ALPHAI | |||||
| .L0t: | |||||
| xorq I,I | |||||
| addq $1,I | |||||
| salq $20,I | |||||
| subq I,MMM | |||||
| movq I,M | |||||
| movss ALPHAR,%xmm0 | |||||
| movss ALPHAI,%xmm1 | |||||
| jge .L00t | |||||
| movq MMM,M | |||||
| addq I,M | |||||
| jle .L999x | |||||
| .L00t: | |||||
| movq AA, A | |||||
| movq NN, N | |||||
| movq LDAX, LDA | |||||
| movq OLD_INCX, INCX | movq OLD_INCX, INCX | ||||
| movq OLD_Y, Y | movq OLD_Y, Y | ||||
| movq OLD_INCY, INCY | movq OLD_INCY, INCY | ||||
| @@ -4350,6 +4389,11 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movq M, I | |||||
| salq $ZBASE_SHIFT,I | |||||
| addq I,AA | |||||
| jmp .L0t | |||||
| .L999x: | |||||
| movq 0(%rsp), %rbx | movq 0(%rsp), %rbx | ||||
| movq 8(%rsp), %rbp | movq 8(%rsp), %rbp | ||||
| movq 16(%rsp), %r12 | movq 16(%rsp), %r12 | ||||
| @@ -0,0 +1,408 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* register map for the kernel below, which computes y := alpha * x + y on packed/scalar doubles */ | |||||
| #include "common.h" | |||||
| #ifndef WINDOWS_ABI | |||||
| #define M ARG1 /* element count */ | |||||
| #define X ARG4 /* source vector pointer */ | |||||
| #define INCX ARG5 /* source stride, in elements on entry */ | |||||
| #define Y ARG6 /* destination vector pointer */ | |||||
| #define INCY ARG2 /* destination stride; reloaded from the stack in the prologue */ | |||||
| #else | |||||
| #define M ARG1 | |||||
| #define X ARG2 /* X, INCX and Y are reloaded from the stack in the prologue */ | |||||
| #define INCX ARG3 | |||||
| #define Y ARG4 | |||||
| #define INCY %r10 /* loaded from the stack in the prologue */ | |||||
| #endif | |||||
| #define YY %r11 /* read cursor over y in the strided path; Y is the write cursor there */ | |||||
| #define ALPHA %xmm15 /* alpha, duplicated into both 64-bit lanes by the prologue */ | |||||
| #define A_PRE 640 /* prefetch displacement in bytes, used with both X and Y */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE /* kernel entry: fetch stack arguments, save registers, broadcast alpha, scale strides */ | |||||
| PROFCODE | |||||
| #ifndef WINDOWS_ABI | |||||
| #ifndef XDOUBLE | |||||
| movq 8(%rsp), INCY /* INCY is passed on the stack */ | |||||
| #else | |||||
| movq 24(%rsp), INCY | |||||
| #endif | |||||
| vmovups %xmm0, ALPHA /* alpha arrives in %xmm0 */ | |||||
| #else | |||||
| movq 40(%rsp), X /* stack arguments must be read before SAVEREGISTERS runs */ | |||||
| movq 48(%rsp), INCX | |||||
| movq 56(%rsp), Y | |||||
| movq 64(%rsp), INCY | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| #ifdef WINDOWS_ABI | |||||
| /* alpha arrives in %xmm3. ALPHA is %xmm15, so it is only written after */ | |||||
| /* SAVEREGISTERS; writing it earlier would presumably make SAVEREGISTERS */ | |||||
| /* spill alpha instead of the caller's %xmm15 (confirm against the macro */ | |||||
| /* definition). Same ordering as the other axpy kernels. */ | |||||
| vmovups %xmm3, ALPHA | |||||
| #endif | |||||
| unpcklpd ALPHA, ALPHA /* duplicate the low double into both lanes */ | |||||
| leaq (, INCX, SIZE), INCX /* convert strides from elements to bytes */ | |||||
| leaq (, INCY, SIZE), INCY | |||||
| testq M, M /* dispatch: nothing to do when M <= 0 */ | |||||
| jle .L47 | |||||
| cmpq $SIZE, INCX /* strides are in bytes here; anything but unit stride takes the strided path */ | |||||
| jne .L40 | |||||
| cmpq $SIZE, INCY | |||||
| jne .L40 | |||||
| testq $SIZE, Y /* is the SIZE bit of the Y address set? */ | |||||
| je .L10 /* clear: no peel needed */ | |||||
| movsd (X), %xmm0 /* peel one element: y[0] = alpha * x[0] + y[0] */ | |||||
| mulsd ALPHA, %xmm0 | |||||
| addsd (Y), %xmm0 | |||||
| movsd %xmm0, (Y) | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq M | |||||
| jle .L19 /* that was the only element */ | |||||
| ALIGN_4 | |||||
| .L10: /* unit-stride path: 16 elements per iteration, software pipelined */ | |||||
| subq $-16 * SIZE, X /* bias both pointers by +16 elements; the current chunk is then at -16..-1 */ | |||||
| subq $-16 * SIZE, Y | |||||
| movq M, %rax | |||||
| sarq $4, %rax /* rax = number of full 16-element chunks */ | |||||
| jle .L13 /* none: go straight to the tails */ | |||||
| vmovups -16 * SIZE(X), %xmm0 /* preload the first 8 x values of the first chunk */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| decq %rax | |||||
| jle .L12 /* exactly one chunk: skip the steady-state loop */ | |||||
| ALIGN_3 | |||||
| .L11: /* steady state: finish the current chunk and preload the next chunk's first 8 x values */ | |||||
| prefetchnta A_PRE(Y) | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 /* xmm0 = xmm0 * ALPHA + y (FMA4, AT&T operand order) */ | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| prefetchnta A_PRE(X) | |||||
| nop | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| prefetchnta A_PRE+64(Y) | |||||
| vmovups 0 * SIZE(X), %xmm0 /* non-negative offsets: x values of the next chunk */ | |||||
| vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 | |||||
| vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 | |||||
| vmovups 2 * SIZE(X), %xmm1 | |||||
| vmovups 4 * SIZE(X), %xmm2 | |||||
| vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 | |||||
| vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 | |||||
| vmovups 6 * SIZE(X), %xmm3 | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| prefetchnta A_PRE+64(X) | |||||
| nop | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y /* advance both pointers by 16 elements */ | |||||
| subq $-16 * SIZE, X | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: last full chunk, same arithmetic as .L11 but with no preload of a following chunk */ | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 | |||||
| vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 | |||||
| vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 | |||||
| vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y | |||||
| subq $-16 * SIZE, X | |||||
| ALIGN_3 | |||||
| .L13: /* unit-stride tails: handle the 8, 4, 2 and 1 bits of M in turn; X and Y still carry the +16 element bias */ | |||||
| movq M, %rax | |||||
| andq $8, %rax | |||||
| jle .L14 /* bit clear: no 8-element group */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 /* xmm = x * ALPHA + y */ | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: /* 4-element group */ | |||||
| movq M, %rax | |||||
| andq $4, %rax | |||||
| jle .L15 | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: /* 2-element group */ | |||||
| movq M, %rax | |||||
| andq $2, %rax | |||||
| jle .L16 | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: /* final single element, scalar */ | |||||
| movq M, %rax | |||||
| andq $1, %rax | |||||
| jle .L19 | |||||
| ALIGN_3 | |||||
| vmovsd -16 * SIZE(X), %xmm0 | |||||
| vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vmovsd %xmm0, -16 * SIZE(Y) | |||||
| ALIGN_3 | |||||
| .L19: /* unit-stride exit: return 0 in %rax */ | |||||
| xorq %rax,%rax | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| ALIGN_3 | |||||
| .L40: /* strided path: INCX and INCY are byte strides */ | |||||
| movq Y, YY /* YY reads y ahead of the stores made through Y */ | |||||
| movq M, %rax /* rax = M in case a zero stride sends us straight to the scalar loop */ | |||||
| // If incx == 0 or incy == 0, skip the unrolled loop and process every element in the scalar loop. | |||||
| cmpq $0, INCX | |||||
| je .L46 | |||||
| cmpq $0, INCY | |||||
| je .L46 | |||||
| sarq $3, %rax /* rax = number of 8-element groups */ | |||||
| jle .L45 | |||||
| prefetchnta 512(X) | |||||
| prefetchnta 512+64(X) | |||||
| prefetchnta 512+128(X) | |||||
| prefetchnta 512+192(X) | |||||
| prefetchnta 512(Y) | |||||
| prefetchnta 512+64(Y) | |||||
| prefetchnta 512+128(Y) | |||||
| prefetchnta 512+192(Y) | |||||
| ALIGN_3 | |||||
| .L41: /* 8 elements per iteration: gather pairs of x and y, fused multiply-add, scatter to y */ | |||||
| vmovsd 0 * SIZE(X), %xmm0 /* low lane = x[i] */ | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm0 , %xmm0 /* high lane = x[i+1] */ | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm6 /* same pairing for y, read through YY */ | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6 | |||||
| addq INCY, YY | |||||
| vmovsd 0 * SIZE(X), %xmm1 | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm1 , %xmm1 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm7 | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7 | |||||
| addq INCY, YY | |||||
| vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0 /* xmm0 = xmm0 * ALPHA + xmm6 */ | |||||
| vmovsd 0 * SIZE(X), %xmm2 | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm2 , %xmm2 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm8 | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8 | |||||
| addq INCY, YY | |||||
| vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1 | |||||
| vmovsd 0 * SIZE(X), %xmm3 | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm3 , %xmm3 | |||||
| addq INCX, X | |||||
| vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2 | |||||
| vmovsd 0 * SIZE(YY), %xmm9 | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9 | |||||
| addq INCY, YY | |||||
| vmovsd %xmm0, 0 * SIZE(Y) /* scatter the results: low lane, then high lane, stepping Y by INCY */ | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm0, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm1, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm1, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm2, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm2, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3 | |||||
| vmovsd %xmm3, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm3, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L41 | |||||
| ALIGN_3 | |||||
| .L45: /* remainder: M mod 8 elements */ | |||||
| movq M, %rax | |||||
| andq $7, %rax | |||||
| jle .L47 | |||||
| ALIGN_3 | |||||
| .L46: /* scalar loop, rax iterations: y = x * ALPHA + y, read and written through Y */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0 | |||||
| vmovsd %xmm0, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L46 | |||||
| ALIGN_3 | |||||
| .L47: /* strided / empty exit: return 0 in %rax */ | |||||
| xorq %rax, %rax | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,291 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* register map for the kernel below, which copies M doubles from x to y */ | |||||
| #include "common.h" | |||||
| #define M ARG1 /* rdi */ | |||||
| #define X ARG2 /* rsi */ | |||||
| #define INCX ARG3 /* rdx */ | |||||
| #define Y ARG4 /* rcx */ | |||||
| #ifndef WINDOWS_ABI | |||||
| #define INCY ARG5 /* r8 */ | |||||
| #else | |||||
| #define INCY %r10 /* loaded from the stack in the prologue */ | |||||
| #endif | |||||
| #include "l1param.h" | |||||
| #define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG /* unaligned 16-byte load of OFFSET(ADDR) into REG */ | |||||
| #define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2 /* not referenced by the code below */ | |||||
| #define A_PRE 640 /* prefetch displacement in bytes for X */ | |||||
| #define B_PRE 640 /* prefetch displacement in bytes for Y */ | |||||
| PROLOGUE /* kernel entry: fetch INCY, save registers, scale strides, choose a path */ | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| movq 40(%rsp), INCY /* fifth argument is passed on the stack; read before SAVEREGISTERS */ | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| leaq (, INCX, SIZE), INCX /* convert strides from elements to bytes */ | |||||
| leaq (, INCY, SIZE), INCY | |||||
| cmpq $SIZE, INCX /* anything but unit stride on both vectors takes the strided path */ | |||||
| jne .L40 | |||||
| cmpq $SIZE, INCY | |||||
| jne .L40 | |||||
| testq $SIZE, X /* is the SIZE bit of the X address set? */ | |||||
| je .L10 /* clear: no peel needed */ | |||||
| vmovsd (X), %xmm0 /* peel one element; NOTE(review): no M <= 0 check precedes this, presumably the caller guarantees M > 0 - confirm */ | |||||
| vmovsd %xmm0, (Y) | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq M | |||||
| jle .L19 /* that was the only element */ | |||||
| ALIGN_4 | |||||
| .L10: /* unit-stride path: 16 elements per iteration, loads run one chunk ahead of stores */ | |||||
| subq $-16 * SIZE, X /* bias both pointers by +16 elements; the current chunk is then at -16..-1 */ | |||||
| subq $-16 * SIZE, Y | |||||
| movq M, %rax | |||||
| sarq $4, %rax /* rax = number of full 16-element chunks */ | |||||
| jle .L13 /* none: go straight to the tails */ | |||||
| vmovups -16 * SIZE(X), %xmm0 /* preload the whole first chunk into xmm0-xmm7 */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| decq %rax | |||||
| jle .L12 /* exactly one chunk: skip the steady-state loop */ | |||||
| ALIGN_4 | |||||
| .L11: /* steady state: store the loaded chunk while loading the next one */ | |||||
| prefetchnta A_PRE(X) | |||||
| nop | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| prefetchnta B_PRE(Y) | |||||
| nop | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| VLOAD( 0 * SIZE, X, %xmm0) /* non-negative offsets: elements of the next chunk */ | |||||
| VLOAD( 2 * SIZE, X, %xmm1) | |||||
| VLOAD( 4 * SIZE, X, %xmm2) | |||||
| VLOAD( 6 * SIZE, X, %xmm3) | |||||
| prefetchnta A_PRE+64(X) | |||||
| nop | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| prefetchnta B_PRE+64(Y) | |||||
| nop | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| VLOAD( 8 * SIZE, X, %xmm4) | |||||
| VLOAD(10 * SIZE, X, %xmm5) | |||||
| subq $-16 * SIZE, Y /* Y advances here; X advances only after its last two loads below */ | |||||
| VLOAD(12 * SIZE, X, %xmm6) | |||||
| VLOAD(14 * SIZE, X, %xmm7) | |||||
| subq $-16 * SIZE, X | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: store the last loaded chunk, no further loads */ | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y | |||||
| subq $-16 * SIZE, X | |||||
| ALIGN_3 | |||||
| .L13: /* unit-stride tails: handle the 8, 4, 2 and 1 bits of M in turn; X and Y still carry the +16 element bias */ | |||||
| testq $8, M | |||||
| jle .L14 /* masked result is zero: no 8-element group */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: /* 4-element group */ | |||||
| testq $4, M | |||||
| jle .L15 | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: /* 2-element group */ | |||||
| testq $2, M | |||||
| jle .L16 | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: /* final single element, scalar */ | |||||
| testq $1, M | |||||
| jle .L19 | |||||
| ALIGN_3 | |||||
| vmovsd -16 * SIZE(X), %xmm0 | |||||
| vmovsd %xmm0, -16 * SIZE(Y) | |||||
| ALIGN_3 | |||||
| .L19: /* unit-stride exit: return 0 in %rax */ | |||||
| xorq %rax,%rax | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| ALIGN_3 | |||||
| .L40: /* strided path: INCX and INCY are byte strides */ | |||||
| movq M, %rax | |||||
| sarq $3, %rax /* rax = number of 8-element groups */ | |||||
| jle .L45 | |||||
| ALIGN_3 | |||||
| .L41: /* 8 elements per iteration: eight scalar loads stepping X, then eight scalar stores stepping Y */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm4 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm1 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm2 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm3 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd %xmm0, (Y) /* stores follow the load order: xmm0, 4, 1, 5, 2, 6, 3, 7 */ | |||||
| addq INCY, Y | |||||
| vmovsd %xmm4, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm1, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm5, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm2, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm6, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm3, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm7, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L41 | |||||
| ALIGN_3 | |||||
| .L45: /* remainder: M mod 8 elements */ | |||||
| movq M, %rax | |||||
| andq $7, %rax | |||||
| jle .L47 | |||||
| ALIGN_3 | |||||
| .L46: /* scalar loop: one element per iteration */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vmovsd %xmm0, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L46 | |||||
| ALIGN_3 | |||||
| .L47: /* strided exit: return 0 in %rax */ | |||||
| xorq %rax, %rax | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,311 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* Dot-product kernel: accumulates X[i] * Y[i] over N double-precision elements and leaves the sum in %xmm0. */ | |||||
| #include "common.h" | |||||
| #define N ARG1 /* rdi: number of elements */ | |||||
| #define X ARG2 /* rsi: pointer to the first vector */ | |||||
| #define INCX ARG3 /* rdx: stride of X in elements (scaled to bytes below) */ | |||||
| #define Y ARG4 /* rcx: pointer to the second vector */ | |||||
| #ifndef WINDOWS_ABI | |||||
| #define INCY ARG5 /* r8: stride of Y in elements (scaled to bytes below) */ | |||||
| #else | |||||
| #define INCY %r10 /* loaded from the stack in the prologue */ | |||||
| #endif | |||||
| #define A_PRE 512 /* byte offset used by the prefetchnta instructions in the main loop */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| movq 40(%rsp), INCY /* INCY is not in a register here; fetch it from the caller's stack */ | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| leaq (, INCX, SIZE), INCX /* INCX *= SIZE: stride in bytes */ | |||||
| leaq (, INCY, SIZE), INCY /* INCY *= SIZE: stride in bytes */ | |||||
| vxorps %xmm0, %xmm0 , %xmm0 /* clear the four partial-sum accumulators xmm0..xmm3 */ | |||||
| vxorps %xmm1, %xmm1 , %xmm1 | |||||
| vxorps %xmm2, %xmm2 , %xmm2 | |||||
| vxorps %xmm3, %xmm3 , %xmm3 | |||||
| cmpq $0, N | |||||
| jle .L999 /* N <= 0: fall through the reduction with all-zero accumulators */ | |||||
| cmpq $SIZE, INCX | |||||
| jne .L50 /* X is not unit stride: use the strided path */ | |||||
| cmpq $SIZE, INCY | |||||
| jne .L50 /* Y is not unit stride: use the strided path */ | |||||
| subq $-16 * SIZE, X /* advance X by 16 elements; the unit-stride code addresses it with offsets starting at -16 * SIZE */ | |||||
| subq $-16 * SIZE, Y /* same bias for Y */ | |||||
| testq $SIZE, Y /* checks the SIZE bit of Y's address; presumably a 16-byte alignment test for SIZE == 8 - TODO confirm */ | |||||
| je .L10 /* bit clear: no leading element to peel */ | |||||
| vmovsd -16 * SIZE(X), %xmm0 /* peel one element: xmm0 = X[0] * Y[0] */ | |||||
| vmulsd -16 * SIZE(Y), %xmm0 , %xmm0 | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq N | |||||
| ALIGN_2 | |||||
| .L10: | |||||
| movq N, %rax | |||||
| sarq $4, %rax /* rax = number of 16-element blocks */ | |||||
| jle .L14 /* fewer than 16 elements left: go to the tail code */ | |||||
| vmovups -16 * SIZE(X), %xmm4 /* preload the first 16 elements of X into xmm4..xmm11 */ | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vmovups -12 * SIZE(X), %xmm6 | |||||
| vmovups -10 * SIZE(X), %xmm7 | |||||
| vmovups -8 * SIZE(X), %xmm8 | |||||
| vmovups -6 * SIZE(X), %xmm9 | |||||
| vmovups -4 * SIZE(X), %xmm10 | |||||
| vmovups -2 * SIZE(X), %xmm11 | |||||
| decq %rax | |||||
| jle .L12 /* only one block: skip the loop and consume the preloaded data at .L12 */ | |||||
| ALIGN_3 | |||||
| .L11: /* main loop: multiply-accumulate the preloaded X block with Y while loading the next X block */ | |||||
| prefetchnta A_PRE(Y) | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 /* xmm0 += xmm4 * Y[0..1] */ | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| prefetchnta A_PRE(X) | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| vmovups 0 * SIZE(X), %xmm4 /* xmm4 was consumed above; reload it with data for the next iteration */ | |||||
| vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 | |||||
| vmovups 2 * SIZE(X), %xmm5 | |||||
| vmovups 4 * SIZE(X), %xmm6 | |||||
| vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 | |||||
| vmovups 6 * SIZE(X), %xmm7 | |||||
| prefetchnta A_PRE+64(Y) | |||||
| vmovups 8 * SIZE(X), %xmm8 | |||||
| vmovups 10 * SIZE(X), %xmm9 | |||||
| prefetchnta A_PRE+64(X) | |||||
| vmovups 12 * SIZE(X), %xmm10 | |||||
| vmovups 14 * SIZE(X), %xmm11 | |||||
| subq $-16 * SIZE, X /* advance both pointers by 16 elements */ | |||||
| subq $-16 * SIZE, Y | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: consume the last preloaded X block without loading another */ | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 | |||||
| subq $-16 * SIZE, X | |||||
| subq $-16 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: /* tail: the remaining N & 15 elements are handled by testing bits 8, 4, 2 and 1 of N */ | |||||
| testq $15, N | |||||
| jle .L999 /* testq clears OF and SF is clear for this mask, so jle is taken only when the result is zero */ | |||||
| testq $8, N | |||||
| jle .L15 | |||||
| vmovups -16 * SIZE(X), %xmm4 /* 8 elements */ | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vmovups -12 * SIZE(X), %xmm6 | |||||
| vmovups -10 * SIZE(X), %xmm7 | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: | |||||
| testq $4, N | |||||
| jle .L16 | |||||
| vmovups -16 * SIZE(X), %xmm4 /* 4 elements */ | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: | |||||
| testq $2, N | |||||
| jle .L17 | |||||
| vmovups -16 * SIZE(X), %xmm4 /* 2 elements */ | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L17: | |||||
| testq $1, N | |||||
| jle .L999 | |||||
| vmovsd -16 * SIZE(X), %xmm4 /* last element; the scalar load zeroes the upper lane, so the packed FMA below only changes the low lane of xmm0 */ | |||||
| vmovsd -16 * SIZE(Y), %xmm5 | |||||
| vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0 | |||||
| jmp .L999 | |||||
| ALIGN_3 | |||||
| .L50: /* strided path: one scalar load per element, pointers advanced by INCX / INCY bytes */ | |||||
| movq N, %rax | |||||
| sarq $3, %rax /* rax = number of 8-element groups */ | |||||
| jle .L55 | |||||
| ALIGN_3 | |||||
| .L53: /* 8 elements per iteration, as two batches of four spread over xmm0..xmm3 */ | |||||
| vmovsd 0 * SIZE(X), %xmm4 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm9 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm10 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm11 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 /* upper lanes of the scalar loads are zero, so only the low lanes accumulate */ | |||||
| vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 | |||||
| vmovsd 0 * SIZE(X), %xmm4 /* second batch of four */ | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm9 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm10 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm11 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 | |||||
| decq %rax | |||||
| jg .L53 | |||||
| ALIGN_3 | |||||
| .L55: | |||||
| movq N, %rax | |||||
| andq $7, %rax /* rax = leftover element count (N & 7) */ | |||||
| jle .L999 | |||||
| ALIGN_3 | |||||
| .L56: /* strided remainder: one element per iteration */ | |||||
| vmovsd 0 * SIZE(X), %xmm4 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 | |||||
| decq %rax | |||||
| jg .L56 | |||||
| ALIGN_3 | |||||
| .L999: /* reduction: fold the four accumulators, then add the two lanes */ | |||||
| vaddpd %xmm1, %xmm0 , %xmm0 | |||||
| vaddpd %xmm3, %xmm2 , %xmm2 | |||||
| vaddpd %xmm2, %xmm0 , %xmm0 | |||||
| vhaddpd %xmm0, %xmm0 , %xmm0 /* horizontal add: both lanes of xmm0 now hold the total */ | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,667 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS /* unaligned load from OFF(ADDR) */ | |||||
| #define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS /* unaligned load from OFF(ADDR + BASE * SCALE) */ | |||||
| #define A_PRE 256 /* byte offset used by the prefetchnta instructions in the 8-line loop */ | |||||
| #ifndef WINDOWS_ABI | |||||
| #define N ARG1 /* rdi */ | |||||
| #define M ARG2 /* rsi */ | |||||
| #define A ARG3 /* rdx */ | |||||
| #define LDA ARG4 /* rcx */ | |||||
| #define B ARG5 /* r8 */ | |||||
| #define AO1 %r9 | |||||
| #define AO2 %r10 | |||||
| #define LDA3 %r11 | |||||
| #define M8 %r12 | |||||
| #else | |||||
| #define N ARG1 /* rcx */ | |||||
| #define M ARG2 /* rdx */ | |||||
| #define A ARG3 /* r8 */ | |||||
| #define LDA ARG4 /* r9 */ | |||||
| #define OLD_B 40 + 56(%rsp) /* stack slot of the fifth argument: 40 bytes above the entry %rsp plus 56 bytes for the seven registers pushed in the prologue */ | |||||
| #define B %r12 | |||||
| #define AO1 %rsi | |||||
| #define AO2 %rdi | |||||
| #define LDA3 %r10 | |||||
| #define M8 %r11 | |||||
| #endif | |||||
| #define I %rax /* loop counter for the 8-wide chunks of M */ | |||||
| #define B0 %rbp /* destination cursor for full 8-element chunks */ | |||||
| #define B1 %r13 /* destination cursor for the 4-element remainder (M & 4) */ | |||||
| #define B2 %r14 /* destination cursor for the 2-element remainder (M & 2) */ | |||||
| #define B3 %r15 /* destination cursor for the 1-element remainder (M & 1) */ | |||||
| PROLOGUE /* Copy kernel: repacks double-precision data from A (lines LDA elements apart) into buffer B. N lines are consumed in groups of 8, 4, 2, 1; within a group M is split into 8-wide chunks plus 4/2/1 remainders, each written to its own region of B. */ | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| pushq %rdi /* rdi/rsi are used as AO2/AO1 on this ABI, so they are saved as well */ | |||||
| pushq %rsi | |||||
| #endif | |||||
| pushq %r15 | |||||
| pushq %r14 | |||||
| pushq %r13 | |||||
| pushq %r12 | |||||
| pushq %rbp | |||||
| #ifdef WINDOWS_ABI | |||||
| movq OLD_B, B /* fetch the fifth argument from the stack */ | |||||
| #endif | |||||
| subq $-16 * SIZE, B /* bias B by +16 elements; stores below use offsets starting at -16 * SIZE */ | |||||
| movq M, B1 | |||||
| movq M, B2 | |||||
| movq M, B3 | |||||
| andq $-8, B1 /* B1 = M rounded down to a multiple of 8 */ | |||||
| andq $-4, B2 /* B2 = M rounded down to a multiple of 4 */ | |||||
| andq $-2, B3 /* B3 = M rounded down to a multiple of 2 */ | |||||
| imulq N, B1 /* scale each by N to get element offsets into B */ | |||||
| imulq N, B2 | |||||
| imulq N, B3 | |||||
| leaq (B, B1, SIZE), B1 /* B1 = start of the region for the 4-element remainder */ | |||||
| leaq (B, B2, SIZE), B2 /* B2 = start of the region for the 2-element remainder */ | |||||
| leaq (B, B3, SIZE), B3 /* B3 = start of the region for the 1-element remainder */ | |||||
| leaq (,LDA, SIZE), LDA /* LDA in bytes */ | |||||
| leaq (LDA, LDA, 2), LDA3 /* LDA3 = 3 * LDA (bytes) */ | |||||
| leaq (, N, SIZE), M8 /* M8 = N * SIZE, taken before N is decremented; B0 advances by 8 * M8 bytes per chunk */ | |||||
| cmpq $8, N | |||||
| jl .L20 | |||||
| ALIGN_4 | |||||
| .L11: /* group of 8 lines: AO1 covers lines 0..3, AO2 covers lines 4..7 */ | |||||
| subq $8, N | |||||
| movq A, AO1 | |||||
| leaq (A, LDA, 4), AO2 | |||||
| leaq (A, LDA, 8), A /* A now points at the next group */ | |||||
| movq B, B0 | |||||
| addq $64 * SIZE, B /* an 8-line group fills 64 elements per 8-wide chunk */ | |||||
| movq M, I | |||||
| sarq $3, I /* I = number of 8-wide chunks of M */ | |||||
| jle .L14 | |||||
| ALIGN_4 | |||||
| .L13: /* copy 8 elements from each of the 8 lines: 64 consecutive elements at B0 */ | |||||
| prefetchnta A_PRE(AO1) | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) /* line 0 */ | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO1, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B0) | |||||
| vmovups %xmm1, -14 * SIZE(B0) | |||||
| vmovups %xmm2, -12 * SIZE(B0) | |||||
| vmovups %xmm3, -10 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO1, LDA, 1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) /* line 1 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B0) | |||||
| vmovups %xmm1, -6 * SIZE(B0) | |||||
| vmovups %xmm2, -4 * SIZE(B0) | |||||
| vmovups %xmm3, -2 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO1, LDA, 2) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) /* line 2 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) | |||||
| vmovups %xmm0, 0 * SIZE(B0) | |||||
| vmovups %xmm1, 2 * SIZE(B0) | |||||
| vmovups %xmm2, 4 * SIZE(B0) | |||||
| vmovups %xmm3, 6 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO1, LDA3, 1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) /* line 3 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, 8 * SIZE(B0) | |||||
| vmovups %xmm1, 10 * SIZE(B0) | |||||
| vmovups %xmm2, 12 * SIZE(B0) | |||||
| vmovups %xmm3, 14 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO2) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) /* line 4 */ | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO2, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO2, %xmm3) | |||||
| vmovups %xmm0, 16 * SIZE(B0) | |||||
| vmovups %xmm1, 18 * SIZE(B0) | |||||
| vmovups %xmm2, 20 * SIZE(B0) | |||||
| vmovups %xmm3, 22 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO2, LDA, 1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) /* line 5 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, 24 * SIZE(B0) | |||||
| vmovups %xmm1, 26 * SIZE(B0) | |||||
| vmovups %xmm2, 28 * SIZE(B0) | |||||
| vmovups %xmm3, 30 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO2, LDA, 2) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) /* line 6 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) | |||||
| vmovups %xmm0, 32 * SIZE(B0) | |||||
| vmovups %xmm1, 34 * SIZE(B0) | |||||
| vmovups %xmm2, 36 * SIZE(B0) | |||||
| vmovups %xmm3, 38 * SIZE(B0) | |||||
| prefetchnta A_PRE(AO2, LDA3, 1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) /* line 7 */ | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, 40 * SIZE(B0) | |||||
| vmovups %xmm1, 42 * SIZE(B0) | |||||
| vmovups %xmm2, 44 * SIZE(B0) | |||||
| vmovups %xmm3, 46 * SIZE(B0) | |||||
| addq $8 * SIZE, AO1 | |||||
| addq $8 * SIZE, AO2 | |||||
| leaq (B0, M8, 8), B0 /* next chunk's destination: B0 += 8 * M8 bytes */ | |||||
| decq I | |||||
| jg .L13 | |||||
| ALIGN_4 | |||||
| .L14: /* M & 4: 4 elements from each of the 8 lines, 32 elements written at B1 */ | |||||
| testq $4, M | |||||
| jle .L16 /* after testq, jle is taken only when the tested bit is clear */ | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B1) | |||||
| vmovups %xmm1, -14 * SIZE(B1) | |||||
| vmovups %xmm2, -12 * SIZE(B1) | |||||
| vmovups %xmm3, -10 * SIZE(B1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B1) | |||||
| vmovups %xmm1, -6 * SIZE(B1) | |||||
| vmovups %xmm2, -4 * SIZE(B1) | |||||
| vmovups %xmm3, -2 * SIZE(B1) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, 0 * SIZE(B1) | |||||
| vmovups %xmm1, 2 * SIZE(B1) | |||||
| vmovups %xmm2, 4 * SIZE(B1) | |||||
| vmovups %xmm3, 6 * SIZE(B1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, 8 * SIZE(B1) | |||||
| vmovups %xmm1, 10 * SIZE(B1) | |||||
| vmovups %xmm2, 12 * SIZE(B1) | |||||
| vmovups %xmm3, 14 * SIZE(B1) | |||||
| addq $4 * SIZE, AO1 | |||||
| addq $4 * SIZE, AO2 | |||||
| subq $-32 * SIZE, B1 /* B1 += 32 elements */ | |||||
| ALIGN_4 | |||||
| .L16: /* M & 2: 2 elements from each of the 8 lines, 16 elements written at B2 */ | |||||
| testq $2, M | |||||
| jle .L18 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B2) | |||||
| vmovups %xmm1, -14 * SIZE(B2) | |||||
| vmovups %xmm2, -12 * SIZE(B2) | |||||
| vmovups %xmm3, -10 * SIZE(B2) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B2) | |||||
| vmovups %xmm1, -6 * SIZE(B2) | |||||
| vmovups %xmm2, -4 * SIZE(B2) | |||||
| vmovups %xmm3, -2 * SIZE(B2) | |||||
| addq $2 * SIZE, AO1 | |||||
| addq $2 * SIZE, AO2 | |||||
| subq $-16 * SIZE, B2 /* B2 += 16 elements */ | |||||
| ALIGN_4 | |||||
| .L18: /* M & 1: one element from each of the 8 lines, packed in pairs and written at B3 */ | |||||
| testq $1, M | |||||
| jle .L19 | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 0 * SIZE(AO1, LDA), %xmm1 | |||||
| vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 | |||||
| vmovsd 0 * SIZE(AO1, LDA3), %xmm3 | |||||
| vunpcklpd %xmm1, %xmm0 , %xmm0 /* xmm0 = { line 0, line 1 } */ | |||||
| vunpcklpd %xmm3, %xmm2 , %xmm2 /* xmm2 = { line 2, line 3 } */ | |||||
| vmovups %xmm0, -16 * SIZE(B3) | |||||
| vmovups %xmm2, -14 * SIZE(B3) | |||||
| vmovsd 0 * SIZE(AO2), %xmm0 | |||||
| vmovsd 0 * SIZE(AO2, LDA), %xmm1 | |||||
| vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2 | |||||
| vmovsd 0 * SIZE(AO2, LDA3), %xmm3 | |||||
| vunpcklpd %xmm1, %xmm0 , %xmm0 /* xmm0 = { line 4, line 5 } */ | |||||
| vunpcklpd %xmm3, %xmm2 , %xmm2 /* xmm2 = { line 6, line 7 } */ | |||||
| vmovups %xmm0, -12 * SIZE(B3) | |||||
| vmovups %xmm2, -10 * SIZE(B3) | |||||
| subq $-8 * SIZE, B3 /* B3 += 8 elements */ | |||||
| ALIGN_4 | |||||
| .L19: | |||||
| cmpq $8, N | |||||
| jge .L11 /* at least 8 lines left: process another group */ | |||||
| ALIGN_4 | |||||
| .L20: /* group of 4 lines (executed at most once): AO1 covers lines 0..1, AO2 covers lines 2..3 */ | |||||
| cmpq $4, N | |||||
| jl .L30 | |||||
| subq $4, N | |||||
| movq A, AO1 | |||||
| leaq (A, LDA, 2), AO2 | |||||
| leaq (A, LDA, 4), A | |||||
| movq B, B0 | |||||
| addq $32 * SIZE, B /* a 4-line group fills 32 elements per 8-wide chunk */ | |||||
| movq M, I | |||||
| sarq $3, I | |||||
| jle .L24 | |||||
| ALIGN_4 | |||||
| .L23: /* copy 8 elements from each of the 4 lines: 32 consecutive elements at B0 */ | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO1, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B0) | |||||
| vmovups %xmm1, -14 * SIZE(B0) | |||||
| vmovups %xmm2, -12 * SIZE(B0) | |||||
| vmovups %xmm3, -10 * SIZE(B0) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B0) | |||||
| vmovups %xmm1, -6 * SIZE(B0) | |||||
| vmovups %xmm2, -4 * SIZE(B0) | |||||
| vmovups %xmm3, -2 * SIZE(B0) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO2, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO2, %xmm3) | |||||
| vmovups %xmm0, 0 * SIZE(B0) | |||||
| vmovups %xmm1, 2 * SIZE(B0) | |||||
| vmovups %xmm2, 4 * SIZE(B0) | |||||
| vmovups %xmm3, 6 * SIZE(B0) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) | |||||
| VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, 8 * SIZE(B0) | |||||
| vmovups %xmm1, 10 * SIZE(B0) | |||||
| vmovups %xmm2, 12 * SIZE(B0) | |||||
| vmovups %xmm3, 14 * SIZE(B0) | |||||
| addq $8 * SIZE, AO1 | |||||
| addq $8 * SIZE, AO2 | |||||
| leaq (B0, M8, 8), B0 | |||||
| decq I | |||||
| jg .L23 | |||||
| ALIGN_4 | |||||
| .L24: /* M & 4: 4 elements from each of the 4 lines, 16 elements written at B1 */ | |||||
| testq $4, M | |||||
| jle .L26 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B1) | |||||
| vmovups %xmm1, -14 * SIZE(B1) | |||||
| vmovups %xmm2, -12 * SIZE(B1) | |||||
| vmovups %xmm3, -10 * SIZE(B1) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm1) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) | |||||
| VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B1) | |||||
| vmovups %xmm1, -6 * SIZE(B1) | |||||
| vmovups %xmm2, -4 * SIZE(B1) | |||||
| vmovups %xmm3, -2 * SIZE(B1) | |||||
| addq $4 * SIZE, AO1 | |||||
| addq $4 * SIZE, AO2 | |||||
| subq $-16 * SIZE, B1 /* B1 += 16 elements */ | |||||
| ALIGN_4 | |||||
| .L26: /* M & 2: 2 elements from each of the 4 lines, 8 elements written at B2 */ | |||||
| testq $2, M | |||||
| jle .L28 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm2) | |||||
| VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B2) | |||||
| vmovups %xmm1, -14 * SIZE(B2) | |||||
| vmovups %xmm2, -12 * SIZE(B2) | |||||
| vmovups %xmm3, -10 * SIZE(B2) | |||||
| addq $2 * SIZE, AO1 | |||||
| addq $2 * SIZE, AO2 | |||||
| subq $-8 * SIZE, B2 /* B2 += 8 elements */ | |||||
| ALIGN_4 | |||||
| .L28: /* M & 1: one element from each of the 4 lines, 4 elements written at B3 */ | |||||
| testq $1, M | |||||
| jle .L30 | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 0 * SIZE(AO1, LDA), %xmm1 | |||||
| vmovsd 0 * SIZE(AO2), %xmm2 | |||||
| vmovsd 0 * SIZE(AO2, LDA), %xmm3 | |||||
| vunpcklpd %xmm1, %xmm0, %xmm0 | |||||
| vunpcklpd %xmm3, %xmm2, %xmm2 | |||||
| vmovups %xmm0, -16 * SIZE(B3) | |||||
| vmovups %xmm2, -14 * SIZE(B3) | |||||
| subq $-4 * SIZE, B3 /* B3 += 4 elements */ | |||||
| ALIGN_4 | |||||
| .L30: /* group of 2 lines (executed at most once): AO1 is line 0, AO2 is line 1 */ | |||||
| cmpq $2, N | |||||
| jl .L40 | |||||
| subq $2, N | |||||
| movq A, AO1 | |||||
| leaq (A, LDA), AO2 | |||||
| leaq (A, LDA, 2), A | |||||
| movq B, B0 | |||||
| addq $16 * SIZE, B /* a 2-line group fills 16 elements per 8-wide chunk */ | |||||
| movq M, I | |||||
| sarq $3, I | |||||
| jle .L34 | |||||
| ALIGN_4 | |||||
| .L33: /* copy 8 elements from each of the 2 lines: 16 consecutive elements at B0 */ | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO1, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B0) | |||||
| vmovups %xmm1, -14 * SIZE(B0) | |||||
| vmovups %xmm2, -12 * SIZE(B0) | |||||
| vmovups %xmm3, -10 * SIZE(B0) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO2, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO2, %xmm3) | |||||
| vmovups %xmm0, -8 * SIZE(B0) | |||||
| vmovups %xmm1, -6 * SIZE(B0) | |||||
| vmovups %xmm2, -4 * SIZE(B0) | |||||
| vmovups %xmm3, -2 * SIZE(B0) | |||||
| addq $8 * SIZE, AO1 | |||||
| addq $8 * SIZE, AO2 | |||||
| leaq (B0, M8, 8), B0 | |||||
| decq I | |||||
| jg .L33 | |||||
| ALIGN_4 | |||||
| .L34: /* M & 4: 4 elements from each of the 2 lines, 8 elements written at B1 */ | |||||
| testq $4, M | |||||
| jle .L36 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm2) | |||||
| VMOVUPS_A1(2 * SIZE, AO2, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B1) | |||||
| vmovups %xmm1, -14 * SIZE(B1) | |||||
| vmovups %xmm2, -12 * SIZE(B1) | |||||
| vmovups %xmm3, -10 * SIZE(B1) | |||||
| addq $4 * SIZE, AO1 | |||||
| addq $4 * SIZE, AO2 | |||||
| subq $-8 * SIZE, B1 /* B1 += 8 elements */ | |||||
| ALIGN_4 | |||||
| .L36: /* M & 2: 2 elements from each of the 2 lines, 4 elements written at B2 */ | |||||
| testq $2, M | |||||
| jle .L38 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(0 * SIZE, AO2, %xmm1) | |||||
| vmovups %xmm0, -16 * SIZE(B2) | |||||
| vmovups %xmm1, -14 * SIZE(B2) | |||||
| addq $2 * SIZE, AO1 | |||||
| addq $2 * SIZE, AO2 | |||||
| subq $-4 * SIZE, B2 /* B2 += 4 elements */ | |||||
| ALIGN_4 | |||||
| .L38: /* M & 1: one element from each of the 2 lines, 2 elements written at B3 */ | |||||
| testq $1, M | |||||
| jle .L40 | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 0 * SIZE(AO2), %xmm1 | |||||
| vunpcklpd %xmm1, %xmm0, %xmm0 | |||||
| vmovups %xmm0, -16 * SIZE(B3) | |||||
| subq $-2 * SIZE, B3 /* B3 += 2 elements */ | |||||
| ALIGN_4 | |||||
| .L40: /* last single line, if any */ | |||||
| cmpq $1, N | |||||
| jl .L999 | |||||
| movq A, AO1 | |||||
| movq B, B0 | |||||
| movq M, I | |||||
| sarq $3, I | |||||
| jle .L44 | |||||
| ALIGN_4 | |||||
| .L43: /* copy 8 elements of the line to B0 */ | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| VMOVUPS_A1(4 * SIZE, AO1, %xmm2) | |||||
| VMOVUPS_A1(6 * SIZE, AO1, %xmm3) | |||||
| vmovups %xmm0, -16 * SIZE(B0) | |||||
| vmovups %xmm1, -14 * SIZE(B0) | |||||
| vmovups %xmm2, -12 * SIZE(B0) | |||||
| vmovups %xmm3, -10 * SIZE(B0) | |||||
| addq $8 * SIZE, AO1 | |||||
| leaq (B0, M8, 8), B0 | |||||
| decq I | |||||
| jg .L43 | |||||
| ALIGN_4 | |||||
| .L44: /* M & 4: 4 elements written at B1 */ | |||||
| testq $4, M | |||||
| jle .L45 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| VMOVUPS_A1(2 * SIZE, AO1, %xmm1) | |||||
| vmovups %xmm0, -16 * SIZE(B1) | |||||
| vmovups %xmm1, -14 * SIZE(B1) | |||||
| addq $4 * SIZE, AO1 | |||||
| subq $-4 * SIZE, B1 | |||||
| ALIGN_4 | |||||
| .L45: /* M & 2: 2 elements written at B2 */ | |||||
| testq $2, M | |||||
| jle .L46 | |||||
| VMOVUPS_A1(0 * SIZE, AO1, %xmm0) | |||||
| vmovups %xmm0, -16 * SIZE(B2) | |||||
| addq $2 * SIZE, AO1 | |||||
| subq $-2 * SIZE, B2 | |||||
| ALIGN_4 | |||||
| .L46: /* M & 1: final element written at B3 */ | |||||
| testq $1, M | |||||
| jle .L999 | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd %xmm0, -16 * SIZE(B3) | |||||
| jmp .L999 | |||||
| ALIGN_4 | |||||
| .L999: /* epilogue: restore the saved registers in reverse push order */ | |||||
| popq %rbp | |||||
| popq %r12 | |||||
| popq %r13 | |||||
| popq %r14 | |||||
| popq %r15 | |||||
| #ifdef WINDOWS_ABI | |||||
| popq %rsi | |||||
| popq %rdi | |||||
| #endif | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -47,7 +47,7 @@ | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define STACKSIZE 64 | |||||
| #define STACKSIZE 128 | |||||
| #define OLD_M %rdi | #define OLD_M %rdi | ||||
| #define OLD_N %rsi | #define OLD_N %rsi | ||||
| @@ -59,9 +59,14 @@ | |||||
| #define STACK_BUFFER 32 + STACKSIZE(%rsp) | #define STACK_BUFFER 32 + STACKSIZE(%rsp) | ||||
| #define ALPHA 48 (%rsp) | #define ALPHA 48 (%rsp) | ||||
| #define MMM 56(%rsp) | |||||
| #define NN 64(%rsp) | |||||
| #define AA 72(%rsp) | |||||
| #define LDAX 80(%rsp) | |||||
| #define XX 88(%rsp) | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | |||||
| #define STACKSIZE 288 | |||||
| #define OLD_M %rcx | #define OLD_M %rcx | ||||
| #define OLD_N %rdx | #define OLD_N %rdx | ||||
| @@ -74,6 +79,12 @@ | |||||
| #define STACK_BUFFER 88 + STACKSIZE(%rsp) | #define STACK_BUFFER 88 + STACKSIZE(%rsp) | ||||
| #define ALPHA 224 (%rsp) | #define ALPHA 224 (%rsp) | ||||
| #define MMM 232(%rsp) | |||||
| #define NN 240(%rsp) | |||||
| #define AA 248(%rsp) | |||||
| #define LDAX 256(%rsp) | |||||
| #define XX 264(%rsp) | |||||
| #endif | #endif | ||||
| #define LDA %r8 | #define LDA %r8 | ||||
| @@ -137,17 +148,42 @@ | |||||
| movq OLD_LDA, LDA | movq OLD_LDA, LDA | ||||
| #endif | #endif | ||||
| movq STACK_INCX, INCX | |||||
| movq STACK_Y, Y | |||||
| movq STACK_INCY, INCY | |||||
| movq STACK_BUFFER, BUFFER | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| movsd %xmm0, ALPHA | movsd %xmm0, ALPHA | ||||
| #else | #else | ||||
| movsd %xmm3, ALPHA | movsd %xmm3, ALPHA | ||||
| #endif | #endif | ||||
| movq STACK_Y, Y | |||||
| movq A,AA | |||||
| movq N,NN | |||||
| movq M,MMM | |||||
| movq LDA,LDAX | |||||
| movq X,XX | |||||
| .L0t: | |||||
| xorq I,I | |||||
| addq $1,I | |||||
| salq $21,I | |||||
| subq I,MMM | |||||
| movq I,M | |||||
| jge .L00t | |||||
| movq MMM,M | |||||
| addq I,M | |||||
| jle .L999x | |||||
| .L00t: | |||||
| movq XX,X | |||||
| movq AA,A | |||||
| movq NN,N | |||||
| movq LDAX,LDA | |||||
| movq STACK_INCX, INCX | |||||
| movq STACK_INCY, INCY | |||||
| movq STACK_BUFFER, BUFFER | |||||
| leaq -1(INCY), %rax | leaq -1(INCY), %rax | ||||
| leaq (,INCX, SIZE), INCX | leaq (,INCX, SIZE), INCX | ||||
| @@ -2815,6 +2851,12 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| leaq (, M, SIZE), %rax | |||||
| addq %rax,AA | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| movq 0(%rsp), %rbx | movq 0(%rsp), %rbx | ||||
| movq 8(%rsp), %rbp | movq 8(%rsp), %rbp | ||||
| movq 16(%rsp), %r12 | movq 16(%rsp), %r12 | ||||
| @@ -0,0 +1,360 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #ifndef WINDOWS_ABI | |||||
| #define M ARG1 /* rdi */ | |||||
| #define N ARG2 /* rsi */ | |||||
| #define A ARG3 /* rdx */ | |||||
| #define LDA ARG4 /* rcx */ | |||||
| #define B ARG5 /* r8 */ | |||||
| #define I %r9 | |||||
| #else | |||||
| #define STACKSIZE 256 | |||||
| #define M ARG1 /* rcx */ | |||||
| #define N ARG2 /* rdx */ | |||||
| #define A ARG3 /* r8 */ | |||||
| #define LDA ARG4 /* r9 */ | |||||
| #define OLD_B 40 + 32 + STACKSIZE(%rsp) | |||||
| #define B %r14 | |||||
| #define I %r15 | |||||
| #endif | |||||
| #define J %r10 | |||||
| #define AO1 %r11 | |||||
| #define AO2 %r12 | |||||
| #define AO3 %r13 | |||||
| #define AO4 %rax | |||||
| PROLOGUE /* Packs A into B two LDA-strided vectors at a time: element k of the first vector is stored next to element k of the second. If N is odd, the last vector is copied to B unchanged. M is the length of each vector. */ | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| pushq %r15 /* I is %r15 on this ABI */ | |||||
| pushq %r14 /* B is %r14 on this ABI */ | |||||
| #endif | |||||
| pushq %r13 | |||||
| pushq %r12 /* AO2 is %r12 */ | |||||
| #ifdef WINDOWS_ABI | |||||
| subq $STACKSIZE, %rsp /* room for the xmm spill below */ | |||||
| vmovups %xmm6, 0(%rsp) /* spill xmm6..xmm15; the copy loops below overwrite them */ | |||||
| vmovups %xmm7, 16(%rsp) | |||||
| vmovups %xmm8, 32(%rsp) | |||||
| vmovups %xmm9, 48(%rsp) | |||||
| vmovups %xmm10, 64(%rsp) | |||||
| vmovups %xmm11, 80(%rsp) | |||||
| vmovups %xmm12, 96(%rsp) | |||||
| vmovups %xmm13, 112(%rsp) | |||||
| vmovups %xmm14, 128(%rsp) | |||||
| vmovups %xmm15, 144(%rsp) | |||||
| movq OLD_B, B /* B is read from the stack here; the 32 in OLD_B equals the four 8-byte pushes above */ | |||||
| #endif | |||||
| leaq (,LDA, SIZE), LDA # Scaling | |||||
| movq N, J | |||||
| sarq $1, J /* J = N / 2: number of vector pairs */ | |||||
| jle .L20 /* no complete pair: only the odd-N tail can remain */ | |||||
| ALIGN_4 | |||||
| .L01: /* ---- one pair of vectors per pass ---- */ | |||||
| movq A, AO1 /* AO1 -> first vector of the pair */ | |||||
| leaq (A, LDA), AO2 /* AO2 -> second vector, one scaled LDA further */ | |||||
| leaq (A, LDA, 2), A /* step A past both vectors for the next pass */ | |||||
| movq M, I | |||||
| sarq $3, I /* I = M / 8: passes of the 8-element unrolled loop */ | |||||
| jle .L08 | |||||
| ALIGN_4 | |||||
| .L03: /* 8 elements from each vector in, 16 interleaved elements out */ | |||||
| #ifndef DOUBLE /* scalar loads alternating AO1/AO2, then scalar stores in the same order */ | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss 0 * SIZE(AO2), %xmm1 | |||||
| vmovss 1 * SIZE(AO1), %xmm2 | |||||
| vmovss 1 * SIZE(AO2), %xmm3 | |||||
| vmovss 2 * SIZE(AO1), %xmm4 | |||||
| vmovss 2 * SIZE(AO2), %xmm5 | |||||
| vmovss 3 * SIZE(AO1), %xmm6 | |||||
| vmovss 3 * SIZE(AO2), %xmm7 | |||||
| vmovss 4 * SIZE(AO1), %xmm8 | |||||
| vmovss 4 * SIZE(AO2), %xmm9 | |||||
| vmovss 5 * SIZE(AO1), %xmm10 | |||||
| vmovss 5 * SIZE(AO2), %xmm11 | |||||
| vmovss 6 * SIZE(AO1), %xmm12 | |||||
| vmovss 6 * SIZE(AO2), %xmm13 | |||||
| vmovss 7 * SIZE(AO1), %xmm14 | |||||
| vmovss 7 * SIZE(AO2), %xmm15 | |||||
| vmovss %xmm0, 0 * SIZE(B) /* B[2k] = AO1[k], B[2k+1] = AO2[k] */ | |||||
| vmovss %xmm1, 1 * SIZE(B) | |||||
| vmovss %xmm2, 2 * SIZE(B) | |||||
| vmovss %xmm3, 3 * SIZE(B) | |||||
| vmovss %xmm4, 4 * SIZE(B) | |||||
| vmovss %xmm5, 5 * SIZE(B) | |||||
| vmovss %xmm6, 6 * SIZE(B) | |||||
| vmovss %xmm7, 7 * SIZE(B) | |||||
| vmovss %xmm8, 8 * SIZE(B) | |||||
| vmovss %xmm9, 9 * SIZE(B) | |||||
| vmovss %xmm10, 10 * SIZE(B) | |||||
| vmovss %xmm11, 11 * SIZE(B) | |||||
| vmovss %xmm12, 12 * SIZE(B) | |||||
| vmovss %xmm13, 13 * SIZE(B) | |||||
| vmovss %xmm14, 14 * SIZE(B) | |||||
| vmovss %xmm15, 15 * SIZE(B) | |||||
| #else /* each xmm holds AO1[k] in its low half and AO2[k] in its high half */ | |||||
| prefetchw 256(B) /* prefetch the destination with intent to write */ | |||||
| prefetchnta 256(AO1) /* non-temporal prefetch of the source */ | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 /* low half <- AO1[k] */ | |||||
| vmovsd 1 * SIZE(AO1), %xmm1 | |||||
| vmovsd 2 * SIZE(AO1), %xmm2 | |||||
| vmovsd 3 * SIZE(AO1), %xmm3 | |||||
| vmovsd 4 * SIZE(AO1), %xmm4 | |||||
| vmovsd 5 * SIZE(AO1), %xmm5 | |||||
| vmovsd 6 * SIZE(AO1), %xmm6 | |||||
| vmovsd 7 * SIZE(AO1), %xmm7 | |||||
| prefetchnta 256(AO2) | |||||
| vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 /* high half <- AO2[k], low half kept */ | |||||
| vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 | |||||
| vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 | |||||
| vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 | |||||
| vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4 | |||||
| vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5 | |||||
| vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6 | |||||
| vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7 | |||||
| prefetchw 256+64(B) | |||||
| vmovups %xmm0, 0 * SIZE(B) /* B[2k..2k+1] = { AO1[k], AO2[k] } */ | |||||
| vmovups %xmm1, 2 * SIZE(B) | |||||
| vmovups %xmm2, 4 * SIZE(B) | |||||
| vmovups %xmm3, 6 * SIZE(B) | |||||
| vmovups %xmm4, 8 * SIZE(B) | |||||
| vmovups %xmm5, 10 * SIZE(B) | |||||
| vmovups %xmm6, 12 * SIZE(B) | |||||
| vmovups %xmm7, 14 * SIZE(B) | |||||
| #endif | |||||
| addq $8 * SIZE, AO1 | |||||
| addq $8 * SIZE, AO2 | |||||
| subq $-16 * SIZE, B /* B += 16 elements, written as subtracting a negative immediate (presumably for a shorter encoding when 16*SIZE is 128 - TODO confirm) */ | |||||
| decq I | |||||
| jg .L03 | |||||
| ALIGN_4 | |||||
| .L08: /* ---- optional 4-element block of the current pair ---- */ | |||||
| testq $4 , M /* is bit 2 of M set? */ | |||||
| je .L14 | |||||
| ALIGN_4 | |||||
| .L13: /* same interleave as .L03, for 4 elements per vector */ | |||||
| #ifndef DOUBLE | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss 0 * SIZE(AO2), %xmm1 | |||||
| vmovss 1 * SIZE(AO1), %xmm2 | |||||
| vmovss 1 * SIZE(AO2), %xmm3 | |||||
| vmovss 2 * SIZE(AO1), %xmm4 | |||||
| vmovss 2 * SIZE(AO2), %xmm5 | |||||
| vmovss 3 * SIZE(AO1), %xmm6 | |||||
| vmovss 3 * SIZE(AO2), %xmm7 | |||||
| vmovss %xmm0, 0 * SIZE(B) | |||||
| vmovss %xmm1, 1 * SIZE(B) | |||||
| vmovss %xmm2, 2 * SIZE(B) | |||||
| vmovss %xmm3, 3 * SIZE(B) | |||||
| vmovss %xmm4, 4 * SIZE(B) | |||||
| vmovss %xmm5, 5 * SIZE(B) | |||||
| vmovss %xmm6, 6 * SIZE(B) | |||||
| vmovss %xmm7, 7 * SIZE(B) | |||||
| #else | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 1 * SIZE(AO1), %xmm1 | |||||
| vmovsd 2 * SIZE(AO1), %xmm2 | |||||
| vmovsd 3 * SIZE(AO1), %xmm3 | |||||
| vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 | |||||
| vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 | |||||
| vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 | |||||
| vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 | |||||
| vmovups %xmm0, 0 * SIZE(B) | |||||
| vmovups %xmm1, 2 * SIZE(B) | |||||
| vmovups %xmm2, 4 * SIZE(B) | |||||
| vmovups %xmm3, 6 * SIZE(B) | |||||
| #endif | |||||
| addq $4 * SIZE, AO1 | |||||
| addq $4 * SIZE, AO2 | |||||
| subq $-8 * SIZE, B /* B += 8 elements */ | |||||
| ALIGN_4 | |||||
| .L14: /* ---- trailing elements of the current pair, one per pass ---- */ | |||||
| movq M, I | |||||
| andq $3, I /* I = M % 4 */ | |||||
| jle .L16 /* the result is 0..3 and AND clears OF, so this is taken only when I == 0 */ | |||||
| ALIGN_4 | |||||
| .L15: | |||||
| #ifndef DOUBLE | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss 0 * SIZE(AO2), %xmm1 | |||||
| vmovss %xmm0, 0 * SIZE(B) | |||||
| vmovss %xmm1, 1 * SIZE(B) | |||||
| #else | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 | |||||
| vmovups %xmm0, 0 * SIZE(B) | |||||
| #endif | |||||
| addq $SIZE, AO1 | |||||
| addq $SIZE, AO2 | |||||
| addq $2 * SIZE, B | |||||
| decq I | |||||
| jg .L15 | |||||
| ALIGN_4 | |||||
| .L16: /* next pair of vectors */ | |||||
| decq J | |||||
| jg .L01 | |||||
| ALIGN_4 | |||||
| .L20: /* ---- odd N: one unpaired vector, copied without interleaving ---- */ | |||||
| testq $1, N | |||||
| jle .L999 /* TEST clears OF and the masked result is 0 or 1, so this is taken when bit 0 of N is clear */ | |||||
| movq A, AO1 | |||||
| movq M, I | |||||
| sarq $2, I /* I = M / 4 */ | |||||
| jle .L34 | |||||
| ALIGN_4 | |||||
| .L33: /* 4 elements per pass */ | |||||
| #ifndef DOUBLE | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups %xmm0, 0 * SIZE(B) | |||||
| #else | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups 2 * SIZE(AO1), %xmm1 | |||||
| vmovups %xmm0, 0 * SIZE(B) | |||||
| vmovups %xmm1, 2 * SIZE(B) | |||||
| #endif | |||||
| addq $4 * SIZE, AO1 | |||||
| subq $-4 * SIZE, B /* B += 4 elements */ | |||||
| decq I | |||||
| jg .L33 | |||||
| ALIGN_4 | |||||
| .L34: /* remaining M % 4 elements of the unpaired vector */ | |||||
| movq M, I | |||||
| andq $3, I | |||||
| jle .L999 | |||||
| ALIGN_4 | |||||
| .L35: /* 1 element per pass */ | |||||
| #ifndef DOUBLE | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss %xmm0, 0 * SIZE(B) | |||||
| #else | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd %xmm0, 0 * SIZE(B) | |||||
| #endif | |||||
| addq $SIZE, AO1 | |||||
| addq $1 * SIZE, B | |||||
| decq I | |||||
| jg .L35 | |||||
| ALIGN_4 | |||||
| .L999: /* ---- common exit: undo the prologue in reverse order ---- */ | |||||
| #ifdef WINDOWS_ABI | |||||
| vmovups 0(%rsp), %xmm6 /* reload xmm6..xmm15 from the spill area */ | |||||
| vmovups 16(%rsp), %xmm7 | |||||
| vmovups 32(%rsp), %xmm8 | |||||
| vmovups 48(%rsp), %xmm9 | |||||
| vmovups 64(%rsp), %xmm10 | |||||
| vmovups 80(%rsp), %xmm11 | |||||
| vmovups 96(%rsp), %xmm12 | |||||
| vmovups 112(%rsp), %xmm13 | |||||
| vmovups 128(%rsp), %xmm14 | |||||
| vmovups 144(%rsp), %xmm15 | |||||
| addq $STACKSIZE, %rsp | |||||
| #endif | |||||
| popq %r12 | |||||
| popq %r13 | |||||
| #ifdef WINDOWS_ABI | |||||
| popq %r14 | |||||
| popq %r15 | |||||
| #endif | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,374 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #ifndef WINDOWS_ABI /* register aliases used by the routine below, one set per calling convention */ | |||||
| #define M ARG1 /* rdi */ | |||||
| #define N ARG2 /* rsi */ | |||||
| #define A ARG3 /* rdx */ | |||||
| #define LDA ARG4 /* rcx */ | |||||
| #define B ARG5 /* r8 */ | |||||
| #define I %r10 /* inner loop counter */ | |||||
| #define J %rbp /* outer loop counter */ | |||||
| #define AO1 %r9 /* source pointer, first vector of the current pair */ | |||||
| #define AO2 %r15 /* source pointer, second vector of the current pair */ | |||||
| #define AO3 %r11 /* not referenced by the routine below */ | |||||
| #define AO4 %r14 /* not referenced by the routine below */ | |||||
| #define BO1 %r13 /* destination for the leftover elements when N is odd */ | |||||
| #define M8 %rbx /* M scaled by SIZE */ | |||||
| #define BO %rax /* running destination pointer inside one pass */ | |||||
| #else | |||||
| #define STACKSIZE 256 /* bytes reserved on the stack; xmm6-xmm15 are spilled into the first 160 */ | |||||
| #define M ARG1 /* rcx */ | |||||
| #define N ARG2 /* rdx */ | |||||
| #define A ARG3 /* r8 */ | |||||
| #define LDA ARG4 /* r9 */ | |||||
| #define OLD_B 40 + 64 + STACKSIZE(%rsp) /* stack slot B is loaded from; 64 = the eight 8-byte pushes in the prologue */ | |||||
| #define B %rdi /* pushed and popped by the WINDOWS_ABI prologue and epilogue */ | |||||
| #define I %r10 /* inner loop counter */ | |||||
| #define J %r11 /* outer loop counter */ | |||||
| #define AO1 %r12 /* source pointer, first vector of the current pair */ | |||||
| #define AO2 %r13 /* source pointer, second vector of the current pair */ | |||||
| #define AO3 %r14 /* not referenced by the routine below */ | |||||
| #define AO4 %r15 /* not referenced by the routine below */ | |||||
| #define BO1 %rsi /* destination for the leftover elements when N is odd */ | |||||
| #define M8 %rbp /* M scaled by SIZE */ | |||||
| #define BO %rax /* running destination pointer inside one pass */ | |||||
| #endif | |||||
| PROLOGUE /* Copies A into B as 2x2 tiles. Each pass takes two LDA-strided vectors and, for every two consecutive elements, writes 2 from the first vector and then 2 from the second; successive tiles of a pass are 2*M elements apart in B. When N is odd the last element of each vector goes to a separate area at BO1. */ | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| pushq %rdi /* B is %rdi on this ABI */ | |||||
| pushq %rsi /* BO1 is %rsi on this ABI */ | |||||
| #endif | |||||
| pushq %r15 | |||||
| pushq %r14 | |||||
| pushq %r13 | |||||
| pushq %r12 | |||||
| pushq %rbp | |||||
| pushq %rbx | |||||
| #ifdef WINDOWS_ABI | |||||
| subq $STACKSIZE, %rsp /* room for the xmm spill below */ | |||||
| vmovups %xmm6, 0(%rsp) /* spill xmm6..xmm15 (the loops below use only xmm0-xmm7) */ | |||||
| vmovups %xmm7, 16(%rsp) | |||||
| vmovups %xmm8, 32(%rsp) | |||||
| vmovups %xmm9, 48(%rsp) | |||||
| vmovups %xmm10, 64(%rsp) | |||||
| vmovups %xmm11, 80(%rsp) | |||||
| vmovups %xmm12, 96(%rsp) | |||||
| vmovups %xmm13, 112(%rsp) | |||||
| vmovups %xmm14, 128(%rsp) | |||||
| vmovups %xmm15, 144(%rsp) | |||||
| movq OLD_B, B /* B is read from the stack here */ | |||||
| #endif | |||||
| movq N, %rax | |||||
| andq $-2, %rax /* N rounded down to an even count */ | |||||
| imulq M, %rax | |||||
| leaq (B, %rax, SIZE), BO1 /* BO1 = B + (N & ~1) * M elements: where the odd-N leftovers are written */ | |||||
| leaq (, LDA, SIZE), LDA /* LDA *= SIZE (a byte stride, assuming SIZE is the element size - TODO confirm in common.h) */ | |||||
| leaq (, M, SIZE), M8 /* M8 = M * SIZE */ | |||||
| movq M, J | |||||
| sarq $1, J /* J = M / 2: number of vector pairs */ | |||||
| jle .L20 | |||||
| ALIGN_4 | |||||
| .L01: /* ---- one pair of vectors per pass ---- */ | |||||
| movq A, AO1 /* AO1 -> first vector of the pair */ | |||||
| leaq (A, LDA ), AO2 /* AO2 -> second vector, one scaled LDA further */ | |||||
| leaq (A, LDA, 2), A /* step A past both vectors for the next pass */ | |||||
| movq B, BO /* BO walks this pass's tiles */ | |||||
| addq $4 * SIZE, B /* the next pass starts one 4-element tile later */ | |||||
| movq N, I | |||||
| sarq $3, I /* I = N / 8: passes of the 8-element unrolled loop */ | |||||
| jle .L10 | |||||
| ALIGN_4 | |||||
| .L08: /* 8 elements from each vector -> four tiles, 2*M8 bytes apart */ | |||||
| #ifndef DOUBLE /* vmovsd moves 8 bytes, i.e. two elements if SIZE is 4 */ | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 2 * SIZE(AO1), %xmm2 | |||||
| vmovsd 4 * SIZE(AO1), %xmm4 | |||||
| vmovsd 6 * SIZE(AO1), %xmm6 | |||||
| vmovsd 0 * SIZE(AO2), %xmm1 | |||||
| vmovsd 2 * SIZE(AO2), %xmm3 | |||||
| vmovsd 4 * SIZE(AO2), %xmm5 | |||||
| vmovsd 6 * SIZE(AO2), %xmm7 | |||||
| vmovsd %xmm0, 0 * SIZE(BO) /* tile = { AO1[k], AO1[k+1], AO2[k], AO2[k+1] } */ | |||||
| vmovsd %xmm1, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO /* BO += 2 * M elements */ | |||||
| vmovsd %xmm2, 0 * SIZE(BO) | |||||
| vmovsd %xmm3, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovsd %xmm4, 0 * SIZE(BO) | |||||
| vmovsd %xmm5, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovsd %xmm6, 0 * SIZE(BO) | |||||
| vmovsd %xmm7, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| #else /* vmovups moves 16 bytes, i.e. two elements if SIZE is 8 */ | |||||
| prefetchnta 256(AO1) /* non-temporal prefetch of both sources */ | |||||
| prefetchnta 256(AO2) | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups 2 * SIZE(AO1), %xmm2 | |||||
| vmovups 4 * SIZE(AO1), %xmm4 | |||||
| vmovups 6 * SIZE(AO1), %xmm6 | |||||
| vmovups 0 * SIZE(AO2), %xmm1 | |||||
| vmovups 2 * SIZE(AO2), %xmm3 | |||||
| vmovups 4 * SIZE(AO2), %xmm5 | |||||
| vmovups 6 * SIZE(AO2), %xmm7 | |||||
| vmovups %xmm0, 0 * SIZE(BO) /* tile = { AO1[k], AO1[k+1], AO2[k], AO2[k+1] } */ | |||||
| vmovups %xmm1, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO /* BO += 2 * M elements */ | |||||
| vmovups %xmm2, 0 * SIZE(BO) | |||||
| vmovups %xmm3, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovups %xmm4, 0 * SIZE(BO) | |||||
| vmovups %xmm5, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovups %xmm6, 0 * SIZE(BO) | |||||
| vmovups %xmm7, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| #endif | |||||
| addq $8 * SIZE, AO1 | |||||
| addq $8 * SIZE, AO2 | |||||
| decq I | |||||
| jg .L08 | |||||
| ALIGN_4 | |||||
| .L10: /* ---- optional 4-element block: two tiles ---- */ | |||||
| testq $4, N | |||||
| jle .L12 /* TEST clears OF and the masked result is non-negative, so this is taken when bit 2 of N is clear */ | |||||
| #ifndef DOUBLE | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 2 * SIZE(AO1), %xmm2 | |||||
| vmovsd 0 * SIZE(AO2), %xmm1 | |||||
| vmovsd 2 * SIZE(AO2), %xmm3 | |||||
| vmovsd %xmm0, 0 * SIZE(BO) | |||||
| vmovsd %xmm1, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovsd %xmm2, 0 * SIZE(BO) | |||||
| vmovsd %xmm3, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| #else | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups 2 * SIZE(AO1), %xmm2 | |||||
| vmovups 0 * SIZE(AO2), %xmm1 | |||||
| vmovups 2 * SIZE(AO2), %xmm3 | |||||
| vmovups %xmm0, 0 * SIZE(BO) | |||||
| vmovups %xmm1, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| vmovups %xmm2, 0 * SIZE(BO) | |||||
| vmovups %xmm3, 2 * SIZE(BO) | |||||
| leaq (BO, M8, 2), BO | |||||
| #endif | |||||
| addq $4 * SIZE, AO1 | |||||
| addq $4 * SIZE, AO2 | |||||
| ALIGN_4 | |||||
| .L12: /* ---- optional 2-element block: one tile ---- */ | |||||
| testq $2, N | |||||
| jle .L14 /* taken when bit 1 of N is clear */ | |||||
| #ifndef DOUBLE | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd 0 * SIZE(AO2), %xmm1 | |||||
| vmovsd %xmm0, 0 * SIZE(BO) | |||||
| vmovsd %xmm1, 2 * SIZE(BO) | |||||
| #else | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups 0 * SIZE(AO2), %xmm1 | |||||
| vmovups %xmm0, 0 * SIZE(BO) | |||||
| vmovups %xmm1, 2 * SIZE(BO) | |||||
| #endif | |||||
| leaq (BO, M8, 2), BO | |||||
| addq $2 * SIZE, AO1 | |||||
| addq $2 * SIZE, AO2 | |||||
| ALIGN_4 | |||||
| .L14: /* ---- odd N: last element of each vector goes to the BO1 area ---- */ | |||||
| testq $1, N | |||||
| jle .L19 /* taken when bit 0 of N is clear */ | |||||
| #ifndef DOUBLE | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss 0 * SIZE(AO2), %xmm1 | |||||
| vmovss %xmm0, 0 * SIZE(BO1) | |||||
| vmovss %xmm1, 1 * SIZE(BO1) | |||||
| #else | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 /* low half <- AO1 element */ | |||||
| vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 /* high half <- AO2 element */ | |||||
| vmovups %xmm0, 0 * SIZE(BO1) | |||||
| #endif | |||||
| addq $2 * SIZE, BO1 /* two leftovers consumed per vector pair */ | |||||
| ALIGN_4 | |||||
| .L19: /* next pair of vectors */ | |||||
| decq J | |||||
| jg .L01 | |||||
| ALIGN_4 | |||||
| .L20: /* ---- odd M: one unpaired vector remains ---- */ | |||||
| testq $1, M | |||||
| jle .L999 /* taken when bit 0 of M is clear */ | |||||
| ALIGN_4 | |||||
| .L31: | |||||
| movq A, AO1 | |||||
| movq B, BO /* B has already advanced by one tile per completed pair */ | |||||
| movq N, I | |||||
| sarq $1, I /* I = N / 2: two elements per pass */ | |||||
| jle .L33 | |||||
| ALIGN_4 | |||||
| .L32: /* copy two elements, then jump 2*M elements ahead in B */ | |||||
| #ifndef DOUBLE | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd %xmm0, 0 * SIZE(BO) | |||||
| #else | |||||
| vmovups 0 * SIZE(AO1), %xmm0 | |||||
| vmovups %xmm0, 0 * SIZE(BO) | |||||
| #endif | |||||
| addq $2 * SIZE, AO1 | |||||
| leaq (BO, M8, 2), BO | |||||
| decq I | |||||
| jg .L32 | |||||
| ALIGN_4 | |||||
| .L33: /* odd N as well: the final single element goes to BO1 */ | |||||
| testq $1, N | |||||
| jle .L999 | |||||
| #ifndef DOUBLE | |||||
| vmovss 0 * SIZE(AO1), %xmm0 | |||||
| vmovss %xmm0, 0 * SIZE(BO1) | |||||
| #else | |||||
| vmovsd 0 * SIZE(AO1), %xmm0 | |||||
| vmovsd %xmm0, 0 * SIZE(BO1) | |||||
| #endif | |||||
| addq $1 * SIZE, BO1 | |||||
| ALIGN_4 | |||||
| .L999: /* ---- common exit: undo the prologue in reverse order ---- */ | |||||
| #ifdef WINDOWS_ABI | |||||
| vmovups 0(%rsp), %xmm6 /* reload xmm6..xmm15 from the spill area */ | |||||
| vmovups 16(%rsp), %xmm7 | |||||
| vmovups 32(%rsp), %xmm8 | |||||
| vmovups 48(%rsp), %xmm9 | |||||
| vmovups 64(%rsp), %xmm10 | |||||
| vmovups 80(%rsp), %xmm11 | |||||
| vmovups 96(%rsp), %xmm12 | |||||
| vmovups 112(%rsp), %xmm13 | |||||
| vmovups 128(%rsp), %xmm14 | |||||
| vmovups 144(%rsp), %xmm15 | |||||
| addq $STACKSIZE, %rsp | |||||
| #endif | |||||
| popq %rbx | |||||
| popq %rbp | |||||
| popq %r12 | |||||
| popq %r13 | |||||
| popq %r14 | |||||
| popq %r15 | |||||
| #ifdef WINDOWS_ABI | |||||
| popq %rsi | |||||
| popq %rdi | |||||
| #endif | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -47,7 +47,7 @@ | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| #define STACKSIZE 64 | |||||
| #define STACKSIZE 128 | |||||
| #define OLD_M %rdi | #define OLD_M %rdi | ||||
| #define OLD_N %rsi | #define OLD_N %rsi | ||||
| @@ -58,10 +58,14 @@ | |||||
| #define STACK_INCY 24 + STACKSIZE(%rsp) | #define STACK_INCY 24 + STACKSIZE(%rsp) | ||||
| #define STACK_BUFFER 32 + STACKSIZE(%rsp) | #define STACK_BUFFER 32 + STACKSIZE(%rsp) | ||||
| #define ALPHA 48 (%rsp) | #define ALPHA 48 (%rsp) | ||||
| #define MMM 56(%rsp) | |||||
| #define NN 64(%rsp) | |||||
| #define AA 72(%rsp) | |||||
| #define LDAX 80(%rsp) | |||||
| #define XX 96(%rsp) | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | |||||
| #define STACKSIZE 288 | |||||
| #define OLD_M %rcx | #define OLD_M %rcx | ||||
| #define OLD_N %rdx | #define OLD_N %rdx | ||||
| @@ -74,6 +78,12 @@ | |||||
| #define STACK_BUFFER 88 + STACKSIZE(%rsp) | #define STACK_BUFFER 88 + STACKSIZE(%rsp) | ||||
| #define ALPHA 224 (%rsp) | #define ALPHA 224 (%rsp) | ||||
| #define MMM 232(%rsp) | |||||
| #define NN 240(%rsp) | |||||
| #define AA 248(%rsp) | |||||
| #define LDAX 256(%rsp) | |||||
| #define XX 264(%rsp) | |||||
| #endif | #endif | ||||
| #define LDA %r8 | #define LDA %r8 | ||||
| @@ -137,17 +147,41 @@ | |||||
| movq OLD_LDA, LDA | movq OLD_LDA, LDA | ||||
| #endif | #endif | ||||
| movq STACK_INCX, INCX | |||||
| movq STACK_Y, Y | |||||
| movq STACK_INCY, INCY | |||||
| movq STACK_BUFFER, BUFFER | |||||
| #ifndef WINDOWS_ABI | #ifndef WINDOWS_ABI | ||||
| movss %xmm0, ALPHA | movss %xmm0, ALPHA | ||||
| #else | #else | ||||
| movss %xmm3, ALPHA | movss %xmm3, ALPHA | ||||
| #endif | #endif | ||||
| movq M,MMM | |||||
| movq A,AA | |||||
| movq N,NN | |||||
| movq LDA,LDAX | |||||
| movq X,XX | |||||
| movq STACK_Y, Y | |||||
| .L0t: | |||||
| xorq I,I | |||||
| addq $1,I | |||||
| salq $22,I | |||||
| subq I,MMM | |||||
| movq I,M | |||||
| jge .L00t | |||||
| movq MMM,M | |||||
| addq I,M | |||||
| jle .L999x | |||||
| .L00t: | |||||
| movq AA,A | |||||
| movq NN,N | |||||
| movq LDAX,LDA | |||||
| movq XX,X | |||||
| movq STACK_INCX, INCX | |||||
| movq STACK_INCY, INCY | |||||
| movq STACK_BUFFER, BUFFER | |||||
| leaq (,INCX, SIZE), INCX | leaq (,INCX, SIZE), INCX | ||||
| leaq (,INCY, SIZE), INCY | leaq (,INCY, SIZE), INCY | ||||
| leaq (,LDA, SIZE), LDA | leaq (,LDA, SIZE), LDA | ||||
| @@ -5990,6 +6024,12 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| leaq (,M,SIZE),%rax | |||||
| addq %rax,AA | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| movq 0(%rsp), %rbx | movq 0(%rsp), %rbx | ||||
| movq 8(%rsp), %rbp | movq 8(%rsp), %rbp | ||||
| movq 16(%rsp), %r12 | movq 16(%rsp), %r12 | ||||
| @@ -63,7 +63,7 @@ | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | |||||
| #define STACKSIZE 288 | |||||
| #define OLD_M %rcx | #define OLD_M %rcx | ||||
| #define OLD_N %rdx | #define OLD_N %rdx | ||||
| @@ -74,10 +74,10 @@ | |||||
| #define STACK_Y 72 + STACKSIZE(%rsp) | #define STACK_Y 72 + STACKSIZE(%rsp) | ||||
| #define STACK_INCY 80 + STACKSIZE(%rsp) | #define STACK_INCY 80 + STACKSIZE(%rsp) | ||||
| #define STACK_BUFFER 88 + STACKSIZE(%rsp) | #define STACK_BUFFER 88 + STACKSIZE(%rsp) | ||||
| #define MMM 216(%rsp) | |||||
| #define NN 224(%rsp) | |||||
| #define AA 232(%rsp) | |||||
| #define LDAX 240(%rsp) | |||||
| #define MMM 232(%rsp) | |||||
| #define NN 240(%rsp) | |||||
| #define AA 248(%rsp) | |||||
| #define LDAX 256(%rsp) | |||||
| #endif | #endif | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -1385,7 +1385,7 @@ ALIGN_5 | |||||
| EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
| EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
| EXTRA_DY $1, yvec13, xvec5; | EXTRA_DY $1, yvec13, xvec5; | ||||
| EXTRA_DY $2, yvec12, xvec4; | |||||
| EXTRA_DY $1, yvec12, xvec4; | |||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | LDL_DX 0*SIZE(C0), xvec0, xvec0; | ||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | LDH_DX 1*SIZE(C0), xvec0, xvec0; | ||||
| @@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1); | |||||
| STH_DX xvec7, 3*SIZE(C0, ldc, 1); | STH_DX xvec7, 3*SIZE(C0, ldc, 1); | ||||
| STL_DX xvec13, 0*SIZE(C0, ldc, 1); | STL_DX xvec13, 0*SIZE(C0, ldc, 1); | ||||
| STH_DX xvec13, 1*SIZE(C0, ldc, 1); | STH_DX xvec13, 1*SIZE(C0, ldc, 1); | ||||
| STL_DX xvec6, 2*SIZE(C0); | |||||
| STH_DX xvec6, 3*SIZE(C0); | |||||
| STL_DX xvec5, 2*SIZE(C0); | |||||
| STH_DX xvec5, 3*SIZE(C0); | |||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C1), xvec0, xvec0; | LDL_DX 0*SIZE(C1), xvec0, xvec0; | ||||
| LDH_DX 1*SIZE(C1), xvec0, xvec0; | LDH_DX 1*SIZE(C1), xvec0, xvec0; | ||||