| @@ -1,16 +1,25 @@ | |||||
| *.obj | *.obj | ||||
| *.lib | *.lib | ||||
| *.dll | *.dll | ||||
| *.dylib | |||||
| *.def | *.def | ||||
| *.o | *.o | ||||
| lapack-3.1.1 | lapack-3.1.1 | ||||
| lapack-3.1.1.tgz | lapack-3.1.1.tgz | ||||
| lapack-3.4.1 | |||||
| lapack-3.4.1.tgz | |||||
| lapack-3.4.2 | |||||
| lapack-3.4.2.tgz | |||||
| *.so | *.so | ||||
| *.a | *.a | ||||
| .svn | .svn | ||||
| *~ | *~ | ||||
| lib.grd | |||||
| nohup.out | |||||
| config.h | config.h | ||||
| Makefile.conf | Makefile.conf | ||||
| Makefile.conf_last | |||||
| config_last.h | |||||
| getarch | getarch | ||||
| getarch_2nd | getarch_2nd | ||||
| utest/openblas_utest | utest/openblas_utest | ||||
| @@ -1,4 +1,115 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.6 | |||||
| 2-Mar-2013 | |||||
| common: | |||||
| * Improved OpenMP performance slightly. (d744c9) | |||||
| * Improved cblas.h compatibility with Intel MKL.(#185) | |||||
| * Fixed the overflowing bug in single thread cholesky factorization. | |||||
| * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) | |||||
| x86/x86-64: | |||||
| * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | |||||
| We will tune the performance in future. | |||||
| * Auto-detect Intel Xeon E7540. | |||||
| * Fixed the overflowing buffer bug of gemv. (#173) | |||||
| * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) | |||||
| MIPS64: | |||||
| ==================================================================== | |||||
| Version 0.2.5 | |||||
| 26-Nov-2012 | |||||
| common: | |||||
| * Added NO_SHARED flag to disable generating the shared library. | |||||
| * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) | |||||
| * Export LAPACK 3.4.2 symbols in shared library. (#147) | |||||
| * Only detect the number of physical CPU cores on Mac OSX. (#157) | |||||
| * Fixed NetBSD build. (#155) | |||||
| * Fixed compilation with TARGET=GENERIC. (#160) | |||||
| x86/x86-64: | |||||
| * Restore the original CPU affinity when calling | |||||
| openblas_set_num_threads(1) (#153) | |||||
| * Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154) | |||||
| MIPS64: | |||||
| ==================================================================== | |||||
| Version 0.2.4 | |||||
| 8-Oct-2012 | |||||
| common: | |||||
| * Upgraded LAPACK to 3.4.2 version. (#145) | |||||
| * Provided support for passing CFLAGS, FFLAGS, PFLAGS, | |||||
| FPFLAGS to make. (#137) | |||||
| * f77blas.h:compatibility for compilers without C99 complex | |||||
| number support. (#141) | |||||
| x86/x86-64: | |||||
| * Added NO_AVX flag. Check OS supporting AVX on runtime. (#139) | |||||
| * Fixed zdot incompatibility ABI issue with GCC 4.7 on | |||||
| Windows 32-bit. (#140) | |||||
| MIPS64: | |||||
| * Fixed the generation of shared library bug. | |||||
| * Fixed the detection bug on the Loongson 3A server. | |||||
| ==================================================================== | |||||
| Version 0.2.3 | |||||
| 20-Aug-2012 | |||||
| common: | |||||
| * Fixed LAPACK unstable bug about ?laswp. (#130) | |||||
| * Fixed the shared library bug about unloading the library on | |||||
| Linux (#132). | |||||
| * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) | |||||
| Please use gcc and IBM xlf. (#134) | |||||
| x86/x86-64: | |||||
| * Supported goto_set_num_threads and openblas_set_num_threads | |||||
| APIs in Windows. They can set the number of threads on runtime. | |||||
| ==================================================================== | |||||
| Version 0.2.2 | |||||
| 6-July-2012 | |||||
| common: | |||||
| * Fixed exporting DLL functions bug on Windows/MingW | |||||
| * Support GNU Hurd (Thank Sylvestre Ledru) | |||||
| * Support kfreebsd kernel (Thank Sylvestre Ledru) | |||||
| x86/x86-64: | |||||
| * Support Intel Sandy Bridge 22nm desktop/mobile CPU | |||||
| SPARC: | |||||
| * Improve the detection of SPARC (Thank Sylvestre Ledru) | |||||
| ==================================================================== | |||||
| Version 0.2.1 | |||||
| 30-Jun-2012 | |||||
| common: | |||||
| x86/x86-64: | |||||
| * Fixed the SEGFAULT bug about hyper-threading | |||||
| * Support AMD Bulldozer by using GotoBLAS2 AMD Barcelona codes | |||||
| ==================================================================== | |||||
| Version 0.2.0 | |||||
| 26-Jun-2012 | |||||
| common: | |||||
| * Removed the limitation (64) of numbers of CPU cores. | |||||
| Now, it supports 256 cores at max. | |||||
| * Supported clang compiler. | |||||
| * Fixed some build bugs on FreeBSD | |||||
| x86/x86-64: | |||||
| * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. | |||||
| Please use gcc >= 4.6 or clang >=3.1. | |||||
| * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. | |||||
| ==================================================================== | |||||
| Version 0.1.1 | |||||
| 29-Apr-2012 | |||||
| common: | |||||
| * Upgraded LAPACK to 3.4.1 version. (Thank Zaheer Chothia) | |||||
| * Supported LAPACKE, a C interface to LAPACK. (Thank Zaheer Chothia) | |||||
| * Fixed the build bug (MD5 and download) on Mac OSX. | |||||
| * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. | |||||
| * Fixed the compatibility issue for compilers without C99 complex number | |||||
| (e.g. Visual Studio) | |||||
| x86/x86_64: | |||||
| * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. | |||||
| * Test alpha=Nan in dscale. | |||||
| * Fixed a SEGFAULT bug in samax on x86 windows. | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.1.0 | Version 0.1.0 | ||||
| 23-Mar-2012 | 23-Mar-2012 | ||||
| @@ -90,6 +90,15 @@ | |||||
| number of threads will consume extra resource. I recommend you to | number of threads will consume extra resource. I recommend you to | ||||
| specify minimum number of threads. | specify minimum number of threads. | ||||
| 1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? | |||||
| A This may be related to a bug in the Linux kernel 2.6.32. Try applying | |||||
| the patch segfaults.patch using | |||||
| patch < segfaults.patch | |||||
| and see if the crashes persist. Note that this patch will lead to many | |||||
| compiler warnings. | |||||
| 2. Architecture Specific issue or Implementation | 2. Architecture Specific issue or Implementation | ||||
| @@ -1,4 +1,4 @@ | |||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -3,7 +3,7 @@ include ./Makefile.system | |||||
| BLASDIRS = interface driver/level2 driver/level3 driver/others | BLASDIRS = interface driver/level2 driver/level3 driver/others | ||||
| ifndef DYNAMIC_ARCH | |||||
| ifneq ($(DYNAMIC_ARCH), 1) | |||||
| BLASDIRS += kernel | BLASDIRS += kernel | ||||
| endif | endif | ||||
| @@ -26,7 +26,7 @@ endif | |||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | ||||
| .PHONY : all libs netlib test ctest shared install | |||||
| .PHONY : all libs netlib test ctest shared install | |||||
| .NOTPARALLEL : all libs prof lapack-test install | .NOTPARALLEL : all libs prof lapack-test install | ||||
| all :: libs netlib tests shared | all :: libs netlib tests shared | ||||
| @@ -80,6 +80,7 @@ endif | |||||
| @echo | @echo | ||||
| shared : | shared : | ||||
| ifndef NO_SHARED | |||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| $(MAKE) -C exports so | $(MAKE) -C exports so | ||||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so | -ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
| @@ -99,11 +100,10 @@ ifeq ($(OSNAME), Darwin) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| $(MAKE) -C exports dll | $(MAKE) -C exports dll | ||||
| -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| $(MAKE) -C exports dll | $(MAKE) -C exports dll | ||||
| -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll | |||||
| endif | |||||
| endif | endif | ||||
| tests : | tests : | ||||
| @@ -147,7 +147,7 @@ ifeq ($(EXPRECISION), 1) | |||||
| echo "#define EXPRECISION">> config_last.h | echo "#define EXPRECISION">> config_last.h | ||||
| endif | endif | ||||
| ## | ## | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| $(MAKE) -C kernel commonlibs || exit 1 | $(MAKE) -C kernel commonlibs || exit 1 | ||||
| for d in $(DYNAMIC_CORE) ; \ | for d in $(DYNAMIC_CORE) ; \ | ||||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | ||||
| @@ -165,7 +165,7 @@ prof_blas : | |||||
| $(MAKE) -C $$d prof || exit 1 ; \ | $(MAKE) -C $$d prof || exit 1 ; \ | ||||
| fi; \ | fi; \ | ||||
| done | done | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| $(MAKE) -C kernel commonprof || exit 1 | $(MAKE) -C kernel commonprof || exit 1 | ||||
| endif | endif | ||||
| @@ -184,7 +184,7 @@ hpl : | |||||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | $(MAKE) -C $$d $(@F) || exit 1 ; \ | ||||
| fi; \ | fi; \ | ||||
| done | done | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| $(MAKE) -C kernel commonlibs || exit 1 | $(MAKE) -C kernel commonlibs || exit 1 | ||||
| for d in $(DYNAMIC_CORE) ; \ | for d in $(DYNAMIC_CORE) ; \ | ||||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | ||||
| @@ -203,47 +203,73 @@ ifeq ($(NO_LAPACK), 1) | |||||
| netlib : | netlib : | ||||
| else | else | ||||
| netlib : lapack-3.4.0 patch.for_lapack-3.4.0 lapack-3.4.0/make.inc | |||||
| netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -@$(MAKE) -C lapack-3.4.0 lapacklib | |||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||||
| endif | |||||
| ifndef NO_LAPACKE | |||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||||
| endif | endif | ||||
| endif | endif | ||||
| prof_lapack : lapack-3.4.0 lapack-3.4.0/make.inc | |||||
| -@$(MAKE) -C lapack-3.4.0 lapack_prof | |||||
| prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||||
| lapack-3.4.0/make.inc : | |||||
| $(NETLIB_LAPACK_DIR)/make.inc : | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -@echo "FORTRAN = $(FC)" > lapack-3.4.0/make.inc | |||||
| -@echo "OPTS = $(FFLAGS)" >> lapack-3.4.0/make.inc | |||||
| -@echo "POPTS = $(FPFLAGS)" >> lapack-3.4.0/make.inc | |||||
| -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.4.0/make.inc | |||||
| -@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.4.0/make.inc | |||||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.4.0/make.inc | |||||
| -@echo "ARCH = $(AR)" >> lapack-3.4.0/make.inc | |||||
| -@echo "RANLIB = $(RANLIB)" >> lapack-3.4.0/make.inc | |||||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.4.0/make.inc | |||||
| -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.4.0/make.inc | |||||
| -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.4.0/make.inc | |||||
| -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.4.0/make.inc | |||||
| # -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.4.0/make.inc | |||||
| -@cat make.inc >> lapack-3.4.0/make.inc | |||||
| endif | |||||
| lapack-3.4.0 : lapack-3.4.0.tgz | |||||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| ifdef INTERFACE64 | |||||
| -@echo "override CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| else | |||||
| -@echo "override CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| endif | |||||
| -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| endif | |||||
| lapack-3.4.2 : lapack-3.4.2.tgz | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ | |||||
| ifndef NO_LAPACK | |||||
| @if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \ | |||||
| echo $(TAR) zxf $< ;\ | echo $(TAR) zxf $< ;\ | ||||
| $(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ | |||||
| $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ | |||||
| rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ | |||||
| else \ | else \ | ||||
| echo " lapack-3.4.0.tgz check sum is wrong (Please use orignal)." ;\ | |||||
| rm -rf lapack-3.4.0 ;\ | |||||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||||
| echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \ | |||||
| exit 1; \ | |||||
| fi | fi | ||||
| endif | endif | ||||
| endif | |||||
| LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz | |||||
| lapack-3.4.0.tgz : | |||||
| lapack-3.4.2.tgz : | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| -wget http://www.netlib.org/lapack/lapack-3.4.0.tgz | |||||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) | |||||
| curl -O $(LAPACK_URL) | |||||
| else | |||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| fetch $(LAPACK_URL) | |||||
| else | |||||
| wget -O $@ $(LAPACK_URL) | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| large.tgz : | large.tgz : | ||||
| @@ -256,21 +282,21 @@ ifndef NOFORTRAN | |||||
| -wget http://www.netlib.org/lapack/timing/timing.tgz | -wget http://www.netlib.org/lapack/timing/timing.tgz | ||||
| endif | endif | ||||
| lapack-timing : lapack-3.4.0 large.tgz timing.tgz | |||||
| lapack-timing : lapack-3.4.2 large.tgz timing.tgz | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| (cd lapack-3.4.0; $(TAR) zxf ../timing.tgz TIMING) | |||||
| (cd lapack-3.4.0/TIMING; $(TAR) zxf ../../large.tgz ) | |||||
| make -C lapack-3.4.0 tmglib | |||||
| make -C lapack-3.4.0/TIMING | |||||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||||
| make -C $(NETLIB_LAPACK_DIR) tmglib | |||||
| make -C $(NETLIB_LAPACK_DIR)/TIMING | |||||
| endif | endif | ||||
| lapack-test : | lapack-test : | ||||
| $(MAKE) -C lapack-3.4.0 tmglib | |||||
| $(MAKE) -C lapack-3.4.0/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc | |||||
| @rm -f lapack-3.4.0/TESTING/*.out | |||||
| $(MAKE) -j 1 -C lapack-3.4.0/TESTING | |||||
| $(GREP) failed lapack-3.4.0/TESTING/*.out | |||||
| $(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc | |||||
| @rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out | |||||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING | |||||
| $(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out | |||||
| dummy : | dummy : | ||||
| @@ -288,10 +314,10 @@ clean :: | |||||
| #endif | #endif | ||||
| @$(MAKE) -C reference clean | @$(MAKE) -C reference clean | ||||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | ||||
| @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||||
| @if test -d lapack-3.4.0; then \ | |||||
| echo deleting lapack-3.4.0; \ | |||||
| rm -rf lapack-3.4.0 ;\ | |||||
| @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||||
| @if test -d $(NETLIB_LAPACK_DIR); then \ | |||||
| echo deleting $(NETLIB_LAPACK_DIR); \ | |||||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||||
| fi | fi | ||||
| @rm -f *.grd Makefile.conf_last config_last.h | @rm -f *.grd Makefile.conf_last config_last.h | ||||
| @echo Done. | |||||
| @echo Done. | |||||
| @@ -23,7 +23,7 @@ install : lib.grd | |||||
| @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | ||||
| @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | ||||
| @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | ||||
| @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||||
| @echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR) | @echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR) | ||||
| @echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h | @echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h | ||||
| @@ -32,8 +32,18 @@ install : lib.grd | |||||
| @cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | @cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | ||||
| @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | ||||
| ifndef NO_CBLAS | |||||
| @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) | @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) | ||||
| @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h | @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h | ||||
| endif | |||||
| ifndef NO_LAPACKE | |||||
| @echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR) | |||||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||||
| endif | |||||
| #for install static library | #for install static library | ||||
| @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) | @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) | ||||
| @@ -61,11 +71,9 @@ ifeq ($(OSNAME), Darwin) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | ||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | ||||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll | |||||
| endif | endif | ||||
| @echo Install OK! | @echo Install OK! | ||||
| @@ -1,3 +1,5 @@ | |||||
| # This is triggered by Makefile.system and runs before any of the code is built. | |||||
| export BINARY | export BINARY | ||||
| export USE_OPENMP | export USE_OPENMP | ||||
| @@ -15,7 +17,7 @@ ifdef CPUIDEMU | |||||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | EXFLAGS = -DCPUIDEMU -DVENDOR=99 | ||||
| endif | endif | ||||
| all: getarch_2nd | |||||
| all: getarch_2nd cblas_noconst.h | |||||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | ./getarch_2nd 0 >> $(TARGET_MAKE) | ||||
| ./getarch_2nd 1 >> $(TARGET_CONF) | ./getarch_2nd 1 >> $(TARGET_CONF) | ||||
| @@ -36,4 +38,7 @@ else | |||||
| $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c | $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c | ||||
| endif | endif | ||||
| cblas_noconst.h : cblas.h | |||||
| perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h | |||||
| dummy: | dummy: | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.1.0 | |||||
| VERSION = 0.2.6 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -24,10 +24,13 @@ VERSION = 0.1.0 | |||||
| # Fortran compiler. Default is g77. | # Fortran compiler. Default is g77. | ||||
| # FC = gfortran | # FC = gfortran | ||||
| # Even you can specify cross compiler | |||||
| # You can even specify a cross compiler. In that case, please also set HOSTCC. | |||||
| # CC = x86_64-w64-mingw32-gcc | # CC = x86_64-w64-mingw32-gcc | ||||
| # FC = x86_64-w64-mingw32-gfortran | # FC = x86_64-w64-mingw32-gfortran | ||||
| # If you use the cross compiler, please set this host compiler. | |||||
| # HOSTCC = gcc | |||||
| # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | ||||
| # BINARY=64 | # BINARY=64 | ||||
| @@ -45,12 +48,19 @@ VERSION = 0.1.0 | |||||
| # automatically detected by the script. | # automatically detected by the script. | ||||
| # NUM_THREADS = 24 | # NUM_THREADS = 24 | ||||
| # If you don't need to generate the shared library, please comment it in. | |||||
| # NO_SHARED = 1 | |||||
| # If you don't need CBLAS interface, please comment it in. | # If you don't need CBLAS interface, please comment it in. | ||||
| # NO_CBLAS = 1 | # NO_CBLAS = 1 | ||||
| # If you don't need LAPACK, please comment it in. | |||||
| # If you don't need LAPACK, please comment it in. | |||||
| # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||||
| # NO_LAPACK = 1 | # NO_LAPACK = 1 | ||||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||||
| # NO_LAPACKE = 1 | |||||
| # If you want to use legacy threaded Level 3 implementation. | # If you want to use legacy threaded Level 3 implementation. | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| @@ -67,6 +77,10 @@ VERSION = 0.1.0 | |||||
| # If you want to disable CPU/Memory affinity on Linux. | # If you want to disable CPU/Memory affinity on Linux. | ||||
| # NO_AFFINITY = 1 | # NO_AFFINITY = 1 | ||||
| # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | |||||
| # and OS. However, the performance is low. | |||||
| # NO_AVX = 1 | |||||
| # If you would like to know minute performance report of GotoBLAS. | # If you would like to know minute performance report of GotoBLAS. | ||||
| # FUNCTION_PROFILE = 1 | # FUNCTION_PROFILE = 1 | ||||
| @@ -90,8 +104,8 @@ VERSION = 0.1.0 | |||||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | ||||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | # with single thread. You can use this flag to avoid the overhead of multi-threading | ||||
| # in small matrix sizes. The default value is 4. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||||
| # in small matrix sizes. The default value is 50. | |||||
| # GEMM_MULTITHREAD_THRESHOLD = 50 | |||||
| # If you need sanity check by comparing reference BLAS. It'll be very | # If you need sanity check by comparing reference BLAS. It'll be very | ||||
| # slow (Not implemented yet). | # slow (Not implemented yet). | ||||
| @@ -104,19 +118,16 @@ VERSION = 0.1.0 | |||||
| # The installation directory. | # The installation directory. | ||||
| # PREFIX = /opt/OpenBLAS | # PREFIX = /opt/OpenBLAS | ||||
| # Common Optimization Flag; -O2 is enough. | |||||
| # DEBUG = 1 | |||||
| ifeq ($(DEBUG), 1) | |||||
| COMMON_OPT += -g | |||||
| # -DDEBUG | |||||
| else | |||||
| COMMON_OPT += -O2 | |||||
| endif | |||||
| # Common Optimization Flag; | |||||
| # The default -O2 is enough. | |||||
| # COMMON_OPT = -O2 | |||||
| # Profiling flags | # Profiling flags | ||||
| COMMON_PROF = -pg | COMMON_PROF = -pg | ||||
| # Build Debug version | |||||
| # DEBUG = 1 | |||||
| # | # | ||||
| # End of user configuration | # End of user configuration | ||||
| # | # | ||||
| @@ -9,8 +9,20 @@ ifndef TOPDIR | |||||
| TOPDIR = . | TOPDIR = . | ||||
| endif | endif | ||||
| ifndef NETLIB_LAPACK_DIR | |||||
| NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2 | |||||
| endif | |||||
| # Default C compiler | # Default C compiler | ||||
| # - Only set if not specified on the command line or inherited from the environment. | |||||
| # - CC is an implicit variable so neither '?=' nor 'ifndef' can be used. | |||||
| # http://stackoverflow.com/questions/4029274/mingw-and-make-variables | |||||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | |||||
| ifeq ($(origin CC),default) | |||||
| CC = gcc | CC = gcc | ||||
| endif | |||||
| # Default Fortran compiler (FC) is selected by f_check. | |||||
| ifndef MAKEFILE_RULE | ifndef MAKEFILE_RULE | ||||
| include $(TOPDIR)/Makefile.rule | include $(TOPDIR)/Makefile.rule | ||||
| @@ -41,16 +53,24 @@ GETARCH_FLAGS += -DUSE64BITINT | |||||
| endif | endif | ||||
| ifndef GEMM_MULTITHREAD_THRESHOLD | ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| GEMM_MULTITHREAD_THRESHOLD=4 | |||||
| GEMM_MULTITHREAD_THRESHOLD=50 | |||||
| endif | endif | ||||
| GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | ||||
| ifeq ($(NO_AVX), 1) | |||||
| GETARCH_FLAGS += -DNO_AVX | |||||
| endif | |||||
| ifeq ($(DEBUG), 1) | |||||
| GETARCH_FLAGS += -g | |||||
| endif | |||||
| # This operation is expensive, so execution should be once. | # This operation is expensive, so execution should be once. | ||||
| ifndef GOTOBLAS_MAKEFILE | ifndef GOTOBLAS_MAKEFILE | ||||
| export GOTOBLAS_MAKEFILE = 1 | export GOTOBLAS_MAKEFILE = 1 | ||||
| # Generating Makefile.conf and config.h | # Generating Makefile.conf and config.h | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||||
| ifndef TARGET_CORE | ifndef TARGET_CORE | ||||
| include $(TOPDIR)/Makefile.conf | include $(TOPDIR)/Makefile.conf | ||||
| @@ -101,6 +121,15 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap | |||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| export MACOSX_DEPLOYMENT_TARGET=10.2 | export MACOSX_DEPLOYMENT_TARGET=10.2 | ||||
| MD5SUM = md5 -r | |||||
| endif | |||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| MD5SUM = md5 -r | |||||
| endif | |||||
| ifeq ($(OSNAME), NetBSD) | |||||
| MD5SUM = md5 -n | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| @@ -120,6 +149,26 @@ EXTRALIB += -defaultlib:advapi32 | |||||
| SUFFIX = obj | SUFFIX = obj | ||||
| PSUFFIX = pobj | PSUFFIX = pobj | ||||
| LIBSUFFIX = lib | LIBSUFFIX = lib | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| #Test for supporting MS_ABI | |||||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||||
| ifeq ($(GCCVERSIONGT4), 1) | |||||
| # GCC Major version > 4 | |||||
| # It is compatible with MSVC ABI. | |||||
| CCOMMON_OPT += -DMS_ABI | |||||
| endif | |||||
| ifeq ($(GCCVERSIONGTEQ4), 1) | |||||
| ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||||
| # GCC Version >=4.7 | |||||
| # It is compatible with MSVC ABI. | |||||
| CCOMMON_OPT += -DMS_ABI | |||||
| endif | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| @@ -223,14 +272,20 @@ endif | |||||
| endif | endif | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||||
| ifneq ($(NO_AVX), 1) | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||||
| ifneq ($(NO_AVX), 1) | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| endif | |||||
| endif | endif | ||||
| ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
| @@ -459,11 +514,28 @@ ifdef INTERFACE64 | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifndef BINARY64 | |||||
| FCOMMON_OPT += -n32 | |||||
| else | |||||
| FCOMMON_OPT += -n64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| FCOMMON_OPT += -loongson3 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| FCOMMON_OPT += -loongson3 | |||||
| endif | |||||
| else | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| FCOMMON_OPT += -m32 | FCOMMON_OPT += -m32 | ||||
| else | else | ||||
| FCOMMON_OPT += -m64 | FCOMMON_OPT += -m64 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef USE_OPENMP | ifdef USE_OPENMP | ||||
| FEXTRALIB += -lstdc++ | FEXTRALIB += -lstdc++ | ||||
| @@ -472,12 +544,30 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), OPEN64) | ifeq ($(C_COMPILER), OPEN64) | ||||
| ifeq ($(ARCH), mips64) | |||||
| ifndef BINARY64 | |||||
| CCOMMON_OPT += -n32 | |||||
| else | |||||
| CCOMMON_OPT += -n64 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3A) | |||||
| CCOMMON_OPT += -loongson3 | |||||
| endif | |||||
| ifeq ($(CORE), LOONGSON3B) | |||||
| CCOMMON_OPT += -loongson3 | |||||
| endif | |||||
| else | |||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| CCOMMON_OPT += -m32 | CCOMMON_OPT += -m32 | ||||
| else | else | ||||
| CCOMMON_OPT += -m64 | CCOMMON_OPT += -m64 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(C_COMPILER), SUN) | ifeq ($(C_COMPILER), SUN) | ||||
| CCOMMON_OPT += -w | CCOMMON_OPT += -w | ||||
| @@ -533,6 +623,16 @@ endif | |||||
| ifeq ($(NO_LAPACK), 1) | ifeq ($(NO_LAPACK), 1) | ||||
| CCOMMON_OPT += -DNO_LAPACK | CCOMMON_OPT += -DNO_LAPACK | ||||
| #Disable LAPACK C interface | |||||
| NO_LAPACKE = 1 | |||||
| endif | |||||
| ifeq ($(NO_LAPACKE), 1) | |||||
| CCOMMON_OPT += -DNO_LAPACKE | |||||
| endif | |||||
| ifeq ($(NO_AVX), 1) | |||||
| CCOMMON_OPT += -DNO_AVX | |||||
| endif | endif | ||||
| ifdef SMP | ifdef SMP | ||||
| @@ -651,17 +751,30 @@ PATCH = patch | |||||
| GREP = grep | GREP = grep | ||||
| endif | endif | ||||
| ifndef MD5SUM | |||||
| MD5SUM = md5sum | MD5SUM = md5sum | ||||
| endif | |||||
| AWK = awk | AWK = awk | ||||
| REVISION = -r$(VERSION) | REVISION = -r$(VERSION) | ||||
| MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) | MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) | ||||
| CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||||
| PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||||
| ifeq ($(DEBUG), 1) | |||||
| COMMON_OPT += -g | |||||
| endif | |||||
| ifndef COMMON_OPT | |||||
| COMMON_OPT = -O2 | |||||
| endif | |||||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||||
| FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) | |||||
| FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||||
| override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||||
| #MAKEOVERRIDES = | |||||
| ifndef SUFFIX | ifndef SUFFIX | ||||
| SUFFIX = o | SUFFIX = o | ||||
| @@ -675,7 +788,7 @@ ifndef LIBSUFFIX | |||||
| LIBSUFFIX = a | LIBSUFFIX = a | ||||
| endif | endif | ||||
| ifndef DYNAMIC_ARCH | |||||
| ifneq ($(DYNAMIC_ARCH), 1) | |||||
| ifndef SMP | ifndef SMP | ||||
| LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) | LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) | ||||
| LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) | LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) | ||||
| @@ -694,8 +807,8 @@ endif | |||||
| endif | endif | ||||
| LIBDLLNAME = $(LIBPREFIX).dll | |||||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | ||||
| LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) | |||||
| LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | ||||
| LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | ||||
| LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | ||||
| @@ -740,6 +853,7 @@ export HAVE_SSE4_1 | |||||
| export HAVE_SSE4_2 | export HAVE_SSE4_2 | ||||
| export HAVE_SSE4A | export HAVE_SSE4A | ||||
| export HAVE_SSE5 | export HAVE_SSE5 | ||||
| export HAVE_AVX | |||||
| export KERNELDIR | export KERNELDIR | ||||
| export FUNCTION_PROFILE | export FUNCTION_PROFILE | ||||
| export TARGET_CORE | export TARGET_CORE | ||||
| @@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||||
| BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) | BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) | ||||
| endif | endif | ||||
| $(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX | |||||
| $(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX | |||||
| $(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX | |||||
| $(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX | |||||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX | |||||
| $(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX | |||||
| $(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX | |||||
| $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX | |||||
| $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX | |||||
| $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX | |||||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX | |||||
| $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX | |||||
| $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||||
| libs :: $(BLASOBJS) $(COMMONOBJS) | libs :: $(BLASOBJS) $(COMMONOBJS) | ||||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | ||||
| @@ -1,83 +0,0 @@ | |||||
| OpenBLAS Readme | |||||
| 1.Introduction | |||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) | |||||
| 2.Intallation | |||||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||||
| Or, | |||||
| check out codes from git://github.com/xianyi/OpenBLAS.git | |||||
| 1)Normal compile | |||||
| (a) type "make" to detect the CPU automatically. | |||||
| or | |||||
| (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||||
| 2)Cross compile | |||||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||||
| examples: | |||||
| On X86 box, compile this library for loongson3a CPU. | |||||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||||
| 3)Debug version | |||||
| make DEBUG=1 | |||||
| 4)Intall to the directory (Optional) | |||||
| e.g. | |||||
| make install PREFIX=your_installation_directory | |||||
| The default directory is /opt/OpenBLAS | |||||
| 3.Support CPU & OS | |||||
| Please read GotoBLAS_01Readme.txt | |||||
| Additional support CPU: | |||||
| x86_64: | |||||
| Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | |||||
| MIPS64: | |||||
| ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. | |||||
| 4.Usages | |||||
| Link with libopenblas.a or -lopenblas for shared library. | |||||
| 4.1 Set the number of threads with environment variables. for example, | |||||
| export OPENBLAS_NUM_THREADS=4 | |||||
| or | |||||
| export GOTO_NUM_THREADS=4 | |||||
| or | |||||
| export OMP_NUM_THREADS=4 | |||||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||||
| 4.2 Set the number of threads with calling functions. for example, | |||||
| void goto_set_num_threads(int num_threads); | |||||
| or | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||||
| 5.Report Bugs | |||||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | |||||
| 6.To-Do List: | |||||
| Optimization on ICT Loongson 3A CPU | |||||
| 7.Contact | |||||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| 8.ChangeLog | |||||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||||
| 9.Known Issues | |||||
| * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | |||||
| is 64. On 32 bits, it is 32. | |||||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. | |||||
| 10. Specification of Git Branches | |||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
| Now, there are 4 branches in github.com. | |||||
| * The master branch. This a main branch to reflect a production-ready state. | |||||
| * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. | |||||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
| * The gh-pages branch. This is for web pages | |||||
| @@ -0,0 +1,117 @@ | |||||
| # OpenBLAS | |||||
| ## Introduction | |||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>. | |||||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | |||||
| ## Installation | |||||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | |||||
| ### Normal compile | |||||
| * type "make" to detect the CPU automatically. | |||||
| or | |||||
| * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||||
| ### Cross compile | |||||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. Finally, set TARGET explicitly. | |||||
| Examples: | |||||
| On X86 box, compile this library for loongson3a CPU. | |||||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||||
| On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. | |||||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||||
| ### Debug version | |||||
| make DEBUG=1 | |||||
| ### Install to the directory (Optional) | |||||
| Example: | |||||
| make install PREFIX=your_installation_directory | |||||
| The default directory is /opt/OpenBLAS | |||||
| ## Support CPU & OS | |||||
| Please read GotoBLAS_01Readme.txt | |||||
| ### Additional support CPU: | |||||
| #### x86/x86-64: | |||||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | |||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | |||||
| #### MIPS64: | |||||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | |||||
| - **ICT Loongson 3B**: Experimental | |||||
| ### Support OS: | |||||
| - **GNU/Linux** | |||||
| - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X. | |||||
| - **FreeBSD**: Supported by the community. We didn't test the library on this OS. | |||||
| ## Usages | |||||
| Link with libopenblas.a or -lopenblas for shared library. | |||||
| ### Set the number of threads with environment variables. | |||||
| Examples: | |||||
| export OPENBLAS_NUM_THREADS=4 | |||||
| or | |||||
| export GOTO_NUM_THREADS=4 | |||||
| or | |||||
| export OMP_NUM_THREADS=4 | |||||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||||
| ### Set the number of threads on runtime. | |||||
| We provide the functions below to control the number of threads at runtime. | |||||
| void goto_set_num_threads(int num_threads); | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||||
| ## Report Bugs | |||||
| Please open an issue at https://github.com/xianyi/OpenBLAS/issues | |||||
| ## Contact | |||||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||||
| ## ChangeLog | |||||
| Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | |||||
| ## Troubleshooting | |||||
| * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. | |||||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code. | |||||
| * The number of CPUs/Cores should be less than or equal to 256. | |||||
| * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. | |||||
| * On Loongson 3A, "make test" may fail because of a pthread_create error (error code EAGAIN). However, the same test case runs correctly when started directly from the shell. | |||||
| ## Specification of Git Branches | |||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
| Now, there are 4 branches in github.com. | |||||
| * The master branch. This is the main branch that reflects a production-ready state. | |||||
| * The develop branch. This is the main branch that reflects the latest delivered development changes for the next release. | |||||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
| * The gh-pages branch. This is for web pages | |||||
| @@ -8,8 +8,8 @@ Supported List: | |||||
| 1.X86/X86_64 | 1.X86/X86_64 | ||||
| a)Intel CPU: | a)Intel CPU: | ||||
| P2 | P2 | ||||
| COPPERMINE | |||||
| KATMAI | KATMAI | ||||
| COPPERMINE | |||||
| NORTHWOOD | NORTHWOOD | ||||
| PRESCOTT | PRESCOTT | ||||
| BANIAS | BANIAS | ||||
| @@ -18,6 +18,7 @@ CORE2 | |||||
| PENRYN | PENRYN | ||||
| DUNNINGTON | DUNNINGTON | ||||
| NEHALEM | NEHALEM | ||||
| SANDYBRIDGE | |||||
| ATOM | ATOM | ||||
| b)AMD CPU: | b)AMD CPU: | ||||
| @@ -27,6 +28,8 @@ OPTERON_SSE3 | |||||
| BARCELONA | BARCELONA | ||||
| SHANGHAI | SHANGHAI | ||||
| ISTANBUL | ISTANBUL | ||||
| BOBCAT | |||||
| BULLDOZER | |||||
| c)VIA CPU: | c)VIA CPU: | ||||
| SSE_GENERIC | SSE_GENERIC | ||||
| @@ -47,6 +50,7 @@ CELL | |||||
| 3.MIPS64 CPU: | 3.MIPS64 CPU: | ||||
| SICORTEX | SICORTEX | ||||
| LOONGSON3A | LOONGSON3A | ||||
| LOONGSON3B | |||||
| 4.IA64 CPU: | 4.IA64 CPU: | ||||
| ITANIUM2 | ITANIUM2 | ||||
| @@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/); | |||||
| $compiler = GCC if ($compiler eq ""); | $compiler = GCC if ($compiler eq ""); | ||||
| $os = Linux if ($data =~ /OS_LINUX/); | $os = Linux if ($data =~ /OS_LINUX/); | ||||
| $os = FreeBSD if ($data =~ /OS_FreeBSD/); | |||||
| $os = NetBSD if ($data =~ /OS_NetBSD/); | |||||
| $os = Darwin if ($data =~ /OS_Darwin/); | |||||
| $os = SunOS if ($data =~ /OS_SunOS/); | |||||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||||
| $os = AIX if ($data =~ /OS_AIX/); | $os = AIX if ($data =~ /OS_AIX/); | ||||
| $os = osf if ($data =~ /OS_OSF/); | $os = osf if ($data =~ /OS_OSF/); | ||||
| $os = WINNT if ($data =~ /OS_WINNT/); | $os = WINNT if ($data =~ /OS_WINNT/); | ||||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); | |||||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||||
| $os = Interix if ($data =~ /OS_INTERIX/); | $os = Interix if ($data =~ /OS_INTERIX/); | ||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | $architecture = x86 if ($data =~ /ARCH_X86/); | ||||
| @@ -174,6 +174,8 @@ $linker_a = ""; | |||||
| $link =~ s/\-Y\sP\,/\-Y/g; | $link =~ s/\-Y\sP\,/\-Y/g; | ||||
| @flags = split(/[\s\,\n]/, $link); | @flags = split(/[\s\,\n]/, $link); | ||||
| # remove leading and trailing quotes from each flag. | |||||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||||
| foreach $flags (@flags) { | foreach $flags (@flags) { | ||||
| if ( | if ( | ||||
| @@ -1,287 +1,293 @@ | |||||
| #ifndef CBLAS_H | #ifndef CBLAS_H | ||||
| #define CBLAS_H | #define CBLAS_H | ||||
| #include <stddef.h> | |||||
| #include "common.h" | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| /* Assume C declarations for C++ */ | /* Assume C declarations for C++ */ | ||||
| #endif /* __cplusplus */ | #endif /* __cplusplus */ | ||||
| #include <stddef.h> | |||||
| #include "common.h" | |||||
| /*Set the number of threads on runtime.*/ | |||||
| void openblas_set_num_threads(int num_threads); | |||||
| void goto_set_num_threads(int num_threads); | |||||
| /*Get the build configure on runtime.*/ | |||||
| char* openblas_get_config(void); | |||||
| #define CBLAS_INDEX size_t | #define CBLAS_INDEX size_t | ||||
| enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | |||||
| enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; | |||||
| enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; | |||||
| enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; | |||||
| enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; | |||||
| float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||||
| double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); | |||||
| void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); | |||||
| void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); | |||||
| void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); | |||||
| float cblas_sasum (blasint n, float *x, blasint incx); | |||||
| double cblas_dasum (blasint n, double *x, blasint incx); | |||||
| float cblas_scasum(blasint n, float *x, blasint incx); | |||||
| double cblas_dzasum(blasint n, double *x, blasint incx); | |||||
| float cblas_snrm2 (blasint N, float *X, blasint incX); | |||||
| double cblas_dnrm2 (blasint N, double *X, blasint incX); | |||||
| float cblas_scnrm2(blasint N, float *X, blasint incX); | |||||
| double cblas_dznrm2(blasint N, double *X, blasint incX); | |||||
| CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); | |||||
| CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); | |||||
| CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); | |||||
| CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); | |||||
| void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||||
| void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||||
| void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); | |||||
| void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); | |||||
| typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | |||||
| typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; | |||||
| typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | |||||
| typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | |||||
| typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | |||||
| float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); | |||||
| double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||||
| float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||||
| double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||||
| openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||||
| openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||||
| openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||||
| openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||||
| void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||||
| void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||||
| void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||||
| void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||||
| float cblas_sasum (const blasint n, const float *x, const blasint incx); | |||||
| double cblas_dasum (const blasint n, const double *x, const blasint incx); | |||||
| float cblas_scasum(const blasint n, const float *x, const blasint incx); | |||||
| double cblas_dzasum(const blasint n, const double *x, const blasint incx); | |||||
| float cblas_snrm2 (const blasint N, const float *X, const blasint incX); | |||||
| double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); | |||||
| float cblas_scnrm2(const blasint N, const float *X, const blasint incX); | |||||
| double cblas_dznrm2(const blasint N, const double *X, const blasint incX); | |||||
| CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); | |||||
| CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); | |||||
| CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); | |||||
| CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); | |||||
| void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||||
| void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||||
| void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); | |||||
| void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); | |||||
| void cblas_srotg(float *a, float *b, float *c, float *s); | void cblas_srotg(float *a, float *b, float *c, float *s); | ||||
| void cblas_drotg(double *a, double *b, double *c, double *s); | void cblas_drotg(double *a, double *b, double *c, double *s); | ||||
| void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); | |||||
| void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); | |||||
| void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); | |||||
| void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); | |||||
| void cblas_sscal(blasint N, float alpha, float *X, blasint incX); | |||||
| void cblas_dscal(blasint N, double alpha, double *X, blasint incX); | |||||
| void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); | |||||
| void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); | |||||
| void cblas_csscal(blasint N, float alpha, float *X, blasint incX); | |||||
| void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); | |||||
| void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); | |||||
| void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); | |||||
| void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); | |||||
| void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||||
| double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); | |||||
| void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||||
| void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||||
| void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||||
| void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||||
| void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, | |||||
| blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, | |||||
| blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, | |||||
| float *Y, blasint incY, float *A, blasint lda); | |||||
| void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, | |||||
| double *Y, blasint incY, double *A, blasint lda); | |||||
| void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||||
| blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||||
| void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||||
| void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, float *Ap, float *X, blasint incX); | |||||
| void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||||
| blasint N, double *Ap, double *X, blasint incX); | |||||
| void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, | |||||
| blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, | |||||
| blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, | |||||
| float *X, blasint incX, float beta, float *Y, blasint incY); | |||||
| void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, | |||||
| double *X, blasint incX, double beta, double *Y, blasint incY); | |||||
| void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); | |||||
| void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); | |||||
| void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); | |||||
| void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); | |||||
| void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); | |||||
| void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); | |||||
| void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); | |||||
| void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); | |||||
| void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||||
| float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||||
| void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||||
| double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||||
| void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||||
| void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||||
| void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); | |||||
| void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||||
| void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||||
| void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||||
| void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||||
| void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||||
| void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||||
| void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||||
| void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||||
| void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); | |||||
| void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); | |||||
| void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); | |||||
| void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); | |||||
| void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); | |||||
| void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); | |||||
| void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); | |||||
| void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); | |||||
| void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); | |||||
| void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); | |||||
| void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||||
| const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); | |||||
| void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||||
| const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); | |||||
| void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||||
| const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); | |||||
| void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||||
| const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); | |||||
| void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||||
| void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||||
| void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||||
| void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||||
| void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||||
| void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||||
| void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||||
| void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||||
| void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||||
| void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||||
| void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, | |||||
| const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||||
| void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, | |||||
| const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||||
| void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, | |||||
| const float *Y, const blasint incY, float *A, const blasint lda); | |||||
| void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, | |||||
| const double *Y, const blasint incY, double *A, const blasint lda); | |||||
| void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||||
| const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||||
| void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||||
| const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||||
| void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||||
| const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||||
| void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||||
| const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||||
| void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, | |||||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||||
| void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, | |||||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||||
| void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||||
| void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||||
| void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||||
| void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||||
| void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||||
| void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||||
| void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||||
| void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||||
| void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||||
| void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||||
| void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, | |||||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||||
| void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, | |||||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||||
| void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, | |||||
| const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||||
| void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, | |||||
| const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||||
| void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, | |||||
| const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||||
| void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, | |||||
| const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||||
| void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); | |||||
| void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); | |||||
| void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); | |||||
| void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); | |||||
| void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); | |||||
| void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); | |||||
| void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); | |||||
| void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); | |||||
| void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||||
| const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||||
| void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||||
| const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||||
| void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||||
| const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||||
| void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||||
| const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||||
| void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||||
| void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||||
| void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||||
| void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||||
| void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||||
| void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||||
| void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||||
| void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||||
| void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | |||||
| void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | |||||
| void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); | |||||
| void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); | |||||
| void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||||
| void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||||
| void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||||
| void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||||
| void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||||
| void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||||
| void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||||
| void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||||
| void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||||
| void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||||
| void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||||
| void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||||
| void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||||
| void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||||
| void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||||
| const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | |||||
| void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||||
| const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | |||||
| void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||||
| void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); | void cblas_xerbla(blasint p, char *rout, char *form, ...); | ||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif /* __cplusplus */ | #endif /* __cplusplus */ | ||||
| #endif | #endif | ||||
| @@ -68,7 +68,7 @@ extern "C" { | |||||
| #define SMP | #define SMP | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define WINDOWS_ABI | #define WINDOWS_ABI | ||||
| #define OS_WINDOWS | #define OS_WINDOWS | ||||
| @@ -89,7 +89,7 @@ extern "C" { | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #endif | #endif | ||||
| #ifdef OS_DARWIN | |||||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #endif | #endif | ||||
| @@ -351,7 +351,12 @@ typedef int blasint; | |||||
| #endif | #endif | ||||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | ||||
| #ifdef __NetBSD__ | |||||
| #define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) | |||||
| #else | |||||
| #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) | #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) | ||||
| #endif | |||||
| #include "param.h" | #include "param.h" | ||||
| #include "common_param.h" | #include "common_param.h" | ||||
| @@ -374,6 +379,31 @@ typedef int blasint; | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | |||||
| #ifndef NOINCLUDE | |||||
| /* Inclusion of a standard header file is needed for definition of __STDC_* | |||||
| predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs | |||||
| as a side effect of including either <features.h> or <stdc-predef.h>. */ | |||||
| #include <stdio.h> | |||||
| #endif // NOINCLUDE | |||||
| /* C99 supports complex floating numbers natively, which GCC also offers as an | |||||
| extension since version 3.0. If neither are available, use a compatible | |||||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||||
| (__GNUC__ >= 3 && !defined(__cplusplus))) | |||||
| #define OPENBLAS_COMPLEX_C99 | |||||
| typedef float _Complex openblas_complex_float; | |||||
| typedef double _Complex openblas_complex_double; | |||||
| typedef xdouble _Complex openblas_complex_xdouble; | |||||
| #else | |||||
| #define OPENBLAS_COMPLEX_STRUCT | |||||
| typedef struct { float real, imag; } openblas_complex_float; | |||||
| typedef struct { double real, imag; } openblas_complex_double; | |||||
| typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||||
| #endif | |||||
| #endif // ASSEMBLER | |||||
| #ifndef IFLUSH | #ifndef IFLUSH | ||||
| #define IFLUSH | #define IFLUSH | ||||
| #endif | #endif | ||||
| @@ -528,7 +558,8 @@ typedef struct { | |||||
| #include "common_level3.h" | #include "common_level3.h" | ||||
| #include "common_lapack.h" | #include "common_lapack.h" | ||||
| #ifdef CBLAS | #ifdef CBLAS | ||||
| #include "cblas.h" | |||||
| /* This header file is generated from "cblas.h" (see Makefile.prebuild). */ | |||||
| #include "cblas_noconst.h" | |||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| @@ -45,6 +45,8 @@ extern "C" { | |||||
| int BLASFUNC(xerbla)(char *, blasint *info, blasint); | int BLASFUNC(xerbla)(char *, blasint *info, blasint); | ||||
| void openblas_set_num_threads_(int *); | |||||
| FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | ||||
| FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | ||||
| @@ -74,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, | |||||
| myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | ||||
| #elif defined RETURN_BY_STACK | #elif defined RETURN_BY_STACK | ||||
| void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); | |||||
| void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); | |||||
| void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); | |||||
| void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); | |||||
| void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *); | |||||
| void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *); | |||||
| void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); | |||||
| void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); | |||||
| void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| #else | #else | ||||
| float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); | |||||
| float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); | |||||
| double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); | |||||
| double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); | |||||
| xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); | |||||
| openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); | |||||
| openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); | |||||
| openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); | |||||
| openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||||
| #endif | #endif | ||||
| void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); | void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); | ||||
| @@ -640,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, | |||||
| int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | ||||
| xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | ||||
| /* Lapack routines */ | |||||
| int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); | int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); | ||||
| int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); | int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); | ||||
| int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); | int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); | ||||
| @@ -675,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float | |||||
| int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); | int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); | ||||
| int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); | int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); | ||||
| int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||||
| int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||||
| int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||||
| int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||||
| int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||||
| int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||||
| int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); | int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); | ||||
| int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); | int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); | ||||
| int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); | int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); | ||||
| @@ -689,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); | |||||
| int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); | int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); | ||||
| int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); | int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); | ||||
| int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||||
| int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||||
| int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||||
| int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||||
| int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||||
| int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||||
| int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); | int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); | ||||
| int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); | int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); | ||||
| int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); | int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); | ||||
| @@ -86,7 +86,13 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned | |||||
| return syscall(SYS_set_mempolicy, mode, addr, flag); | return syscall(SYS_set_mempolicy, mode, addr, flag); | ||||
| } | } | ||||
| static inline int my_gettid(void) { return syscall(SYS_gettid); } | |||||
| static inline int my_gettid(void) { /* Returns an id for the caller: the result of the raw gettid syscall when its number is known at compile time. */ | |||||
| #ifdef SYS_gettid | |||||
| return syscall(SYS_gettid); | |||||
| #else | |||||
| return getpid(); /* SYS_gettid is not defined on this platform: fall back to the process id */ | |||||
| #endif | |||||
| } | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -63,5 +63,7 @@ double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double | |||||
| void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); | void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); | ||||
| double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); | double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); | ||||
| FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); | |||||
| #endif | #endif | ||||
| @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { | |||||
| int openmp_nthreads=0; | int openmp_nthreads=0; | ||||
| #endif | #endif | ||||
| if ((blas_cpu_number == 1) | |||||
| if (blas_cpu_number == 1 | |||||
| #ifdef USE_OPENMP | #ifdef USE_OPENMP | ||||
| || omp_in_parallel() | || omp_in_parallel() | ||||
| @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define PROFCODE | #define PROFCODE | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define SAVEREGISTERS \ | #define SAVEREGISTERS \ | ||||
| subl $32, %esp;\ | subl $32, %esp;\ | ||||
| movups %xmm6, 0(%esp);\ | movups %xmm6, 0(%esp);\ | ||||
| @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define RESTOREREGISTERS | #define RESTOREREGISTERS | ||||
| #endif | #endif | ||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 16; \ | .align 16; \ | ||||
| @@ -282,7 +282,7 @@ REALNAME: | |||||
| #define EPILOGUE .end REALNAME | #define EPILOGUE .end REALNAME | ||||
| #endif | #endif | ||||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 16; \ | .align 16; \ | ||||
| @@ -356,4 +356,11 @@ REALNAME: | |||||
| #ifndef ALIGN_6 | #ifndef ALIGN_6 | ||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| // ffreep %st(0). | |||||
| // Clang did not support the ffreep mnemonic, so the opcode bytes are emitted directly. | |||||
| // For the encoding, see http://www.sandpile.org/x86/opc_fpu.htm | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -353,7 +353,7 @@ REALNAME: | |||||
| #define EPILOGUE .end REALNAME | #define EPILOGUE .end REALNAME | ||||
| #endif | #endif | ||||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text; \ | .text; \ | ||||
| .align 512; \ | .align 512; \ | ||||
| @@ -425,6 +425,7 @@ REALNAME: | |||||
| #define ALIGN_2 .align 2 | #define ALIGN_2 .align 2 | ||||
| #define ALIGN_3 .align 3 | #define ALIGN_3 .align 3 | ||||
| #define ALIGN_4 .align 4 | #define ALIGN_4 .align 4 | ||||
| #define ALIGN_5 .align 5 | |||||
| #define ffreep fstp | #define ffreep fstp | ||||
| #endif | #endif | ||||
| @@ -448,4 +449,10 @@ REALNAME: | |||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| #endif | #endif | ||||
| // ffreep %st(0). | |||||
| // Clang did not support the ffreep mnemonic, so the opcode bytes are emitted directly. | |||||
| // For the encoding, see http://www.sandpile.org/x86/opc_fpu.htm | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -103,6 +103,9 @@ | |||||
| #define CORE_NEHALEM 17 | #define CORE_NEHALEM 17 | ||||
| #define CORE_ATOM 18 | #define CORE_ATOM 18 | ||||
| #define CORE_NANO 19 | #define CORE_NANO 19 | ||||
| #define CORE_SANDYBRIDGE 20 | |||||
| #define CORE_BOBCAT 21 | |||||
| #define CORE_BULLDOZER 22 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -122,6 +125,8 @@ | |||||
| #define HAVE_MISALIGNSSE (1 << 15) | #define HAVE_MISALIGNSSE (1 << 15) | ||||
| #define HAVE_128BITFPU (1 << 16) | #define HAVE_128BITFPU (1 << 16) | ||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | |||||
| #define HAVE_FMA4 (1 << 19) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -188,4 +193,7 @@ typedef struct { | |||||
| #define CPUTYPE_NSGEODE 41 | #define CPUTYPE_NSGEODE 41 | ||||
| #define CPUTYPE_VIAC3 42 | #define CPUTYPE_VIAC3 42 | ||||
| #define CPUTYPE_NANO 43 | #define CPUTYPE_NANO 43 | ||||
| #define CPUTYPE_SANDYBRIDGE 44 | |||||
| #define CPUTYPE_BOBCAT 45 | |||||
| #define CPUTYPE_BULLDOZER 46 | |||||
| #endif | #endif | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -101,12 +101,14 @@ int detect(void){ | |||||
| fclose(infile); | fclose(infile); | ||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A")){ | if (strstr(p, "Loongson-3A")){ | ||||
| return CPU_LOONGSON3A; | return CPU_LOONGSON3A; | ||||
| }else if(strstr(p, "Loongson-3B")){ | }else if(strstr(p, "Loongson-3B")){ | ||||
| return CPU_LOONGSON3B; | return CPU_LOONGSON3B; | ||||
| }else if (strstr(p, "Loongson-3")){ | }else if (strstr(p, "Loongson-3")){ | ||||
| infile = fopen("/proc/cpuinfo", "r"); | infile = fopen("/proc/cpuinfo", "r"); | ||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | while (fgets(buffer, sizeof(buffer), infile)){ | ||||
| if (!strncmp("system type", buffer, 11)){ | if (!strncmp("system type", buffer, 11)){ | ||||
| p = strchr(buffer, ':') + 2; | p = strchr(buffer, ':') + 2; | ||||
| @@ -119,6 +121,24 @@ int detect(void){ | |||||
| }else{ | }else{ | ||||
| return CPU_SICORTEX; | return CPU_SICORTEX; | ||||
| } | } | ||||
| } | |||||
| //Check model name for Loongson3 | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| p = (char *)NULL; | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("model name", buffer, 10)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL){ | |||||
| if (strstr(p, "Loongson-3A")){ | |||||
| return CPU_LOONGSON3A; | |||||
| }else if(strstr(p, "Loongson-3B")){ | |||||
| return CPU_LOONGSON3B; | |||||
| } | |||||
| } | |||||
| #endif | #endif | ||||
| return CPU_UNKNOWN; | return CPU_UNKNOWN; | ||||
| } | } | ||||
| @@ -40,6 +40,13 @@ | |||||
| #include <string.h> | #include <string.h> | ||||
| #include "cpuid.h" | #include "cpuid.h" | ||||
| #ifdef NO_AVX | |||||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||||
| #define CORE_BULLDOZER CORE_BARCELONA | |||||
| #endif | |||||
| #ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
| #if defined(__APPLE__) && defined(__i386__) | #if defined(__APPLE__) && defined(__i386__) | ||||
| @@ -109,6 +116,33 @@ static inline int have_excpuid(void){ | |||||
| return eax & 0xffff; | return eax & 0xffff; | ||||
| } | } | ||||
| #ifndef NO_AVX | |||||
| static inline void xgetbv(int op, int * eax, int * edx){ /* Executes XGETBV with op passed in ECX; stores the resulting EAX into *eax and EDX into *edx. */ | |||||
| //The instruction is emitted as raw opcode bytes rather than by mnemonic (presumably for assemblers that lack xgetbv -- TODO confirm) | |||||
| __asm__ __volatile__ | |||||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||||
| } | |||||
| #endif | |||||
| /* Reports whether AVX may be used. Returns 1 only when CPUID leaf 1 has both ECX bit 28 (AVX) and ECX bit 27 (OSXSAVE) set and XGETBV(0) shows bits 1 and 2 set in EAX; returns 0 otherwise. Always returns 0 when built with NO_AVX. */ | |||||
| int support_avx(void){ | |||||
| #ifndef NO_AVX | |||||
| int eax, ebx, ecx, edx; | |||||
| int ret=0; | |||||
| cpuid(1, &eax, &ebx, &ecx, &edx); | |||||
| /* xgetbv is only executed after CPUID reports OSXSAVE (bit 27) */ | |||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ | |||||
| xgetbv(0, &eax, &edx); | |||||
| if((eax & 6) == 6){ | |||||
| ret=1; /* both state bits (mask 6) are enabled: the OS supports AVX */ | |||||
| } | |||||
| } | |||||
| return ret; | |||||
| #else | |||||
| return 0; | |||||
| #endif | |||||
| } | |||||
| int get_vendor(void){ | int get_vendor(void){ | ||||
| int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
| char vendor[13]; | char vendor[13]; | ||||
| @@ -189,11 +223,17 @@ int get_cputype(int gettype){ | |||||
| if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | ||||
| if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | ||||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | ||||
| #ifndef NO_AVX | |||||
| if (support_avx()) feature |= HAVE_AVX; | |||||
| #endif | |||||
| if (have_excpuid() >= 0x01) { | if (have_excpuid() >= 0x01) { | ||||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
| if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | ||||
| if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | ||||
| #ifndef NO_AVX | |||||
| if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||||
| #endif | |||||
| if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | ||||
| if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | ||||
| } | } | ||||
| @@ -974,21 +1014,44 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_DUNNINGTON; | return CPUTYPE_DUNNINGTON; | ||||
| } | } | ||||
| break; | break; | ||||
| case 2: | |||||
| switch (model) { | |||||
| case 5: | |||||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
| // Xeon (Clarkdale), 32nm | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 10: | |||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 12: | |||||
| //Xeon Processor 5600 (Westmere-EP) | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| case 2: | |||||
| switch (model) { | |||||
| case 5: | |||||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
| // Xeon (Clarkdale), 32nm | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 10: | |||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; //OS doesn't support AVX | |||||
| case 12: | |||||
| //Xeon Processor 5600 (Westmere-EP) | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 13: | |||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 14: | |||||
| // Xeon E7540 | |||||
| case 15: | |||||
| //Xeon Processor E7 (Westmere-EX) | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| case 3: | |||||
| switch (model) { | |||||
| case 10: | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x7: | case 0x7: | ||||
| @@ -1021,6 +1084,13 @@ int get_cpuname(void){ | |||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 5: | |||||
| return CPUTYPE_BOBCAT; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -1140,6 +1210,9 @@ static char *cpuname[] = { | |||||
| "NSGEODE", | "NSGEODE", | ||||
| "VIAC3", | "VIAC3", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| "BOBCAT", | |||||
| "BULLDOZER", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1186,6 +1259,9 @@ static char *lowercpuname[] = { | |||||
| "tms3x00", | "tms3x00", | ||||
| "nsgeode", | "nsgeode", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| "bobcat", | |||||
| "bulldozer", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1209,6 +1285,9 @@ static char *corename[] = { | |||||
| "NEHALEM", | "NEHALEM", | ||||
| "ATOM", | "ATOM", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | |||||
| "BOBCAT", | |||||
| "BULLDOZER", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1232,6 +1311,9 @@ static char *corename_lower[] = { | |||||
| "nehalem", | "nehalem", | ||||
| "atom", | "atom", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | |||||
| "bobcat", | |||||
| "bulldozer", | |||||
| }; | }; | ||||
| @@ -1315,10 +1397,33 @@ int get_coretype(void){ | |||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 10: | case 10: | ||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | ||||
| return CORE_NEHALEM; | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; //OS doesn't support AVX | |||||
| case 12: | case 12: | ||||
| //Xeon Processor 5600 (Westmere-EP) | //Xeon Processor 5600 (Westmere-EP) | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 13: | |||||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; //OS doesn't support AVX | |||||
| case 14: | |||||
| //Xeon E7540 | |||||
| case 15: | |||||
| //Xeon Processor E7 (Westmere-EX) | |||||
| return CORE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| case 3: | |||||
| switch (model) { | |||||
| case 10: | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; //OS doesn't support AVX | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -1334,7 +1439,15 @@ int get_coretype(void){ | |||||
| if (family <= 0x5) return CORE_80486; | if (family <= 0x5) return CORE_80486; | ||||
| if (family <= 0xe) return CORE_ATHLON; | if (family <= 0xe) return CORE_ATHLON; | ||||
| if (family == 0xf){ | if (family == 0xf){ | ||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||||
| else if (exfamily == 5) return CORE_BOBCAT; | |||||
| else if (exfamily == 6) { | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||||
| }else return CORE_BARCELONA; | |||||
| } | } | ||||
| } | } | ||||
| @@ -1400,6 +1513,9 @@ void get_cpuconfig(void){ | |||||
| printf("#define DTB_SIZE %d\n", info.size * 1024); | printf("#define DTB_SIZE %d\n", info.size * 1024); | ||||
| printf("#define DTB_ASSOCIATIVE %d\n", info.associative); | printf("#define DTB_ASSOCIATIVE %d\n", info.associative); | ||||
| printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); | printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); | ||||
| } else { | |||||
| //fall back for some virtual machines. | |||||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||||
| } | } | ||||
| features = get_cputype(GET_FEATURE); | features = get_cputype(GET_FEATURE); | ||||
| @@ -1414,8 +1530,10 @@ void get_cpuconfig(void){ | |||||
| if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | ||||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | ||||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | ||||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | ||||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | ||||
| if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | ||||
| if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | ||||
| @@ -1479,7 +1597,9 @@ void get_sse(void){ | |||||
| if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | ||||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | ||||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | ||||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | ||||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||||
| } | } | ||||
| @@ -34,20 +34,20 @@ COMPILER_GNU | |||||
| OS_LINUX | OS_LINUX | ||||
| #endif | #endif | ||||
| #if defined(__FreeBSD__) | |||||
| OS_FreeBSD | |||||
| #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) | |||||
| OS_FREEBSD | |||||
| #endif | #endif | ||||
| #if defined(__NetBSD__) | #if defined(__NetBSD__) | ||||
| OS_NetBSD | |||||
| OS_NETBSD | |||||
| #endif | #endif | ||||
| #if defined(__sun) | #if defined(__sun) | ||||
| OS_SunOS | |||||
| OS_SUNOS | |||||
| #endif | #endif | ||||
| #if defined(__APPLE__) | #if defined(__APPLE__) | ||||
| OS_Darwin | |||||
| OS_DARWIN | |||||
| #endif | #endif | ||||
| #if defined(_AIX) | #if defined(_AIX) | ||||
| @@ -63,13 +63,18 @@ OS_WINNT | |||||
| #endif | #endif | ||||
| #if defined(__CYGWIN__) | #if defined(__CYGWIN__) | ||||
| OS_CYGWIN | |||||
| OS_CYGWIN_NT | |||||
| #endif | #endif | ||||
| #if defined(__INTERIX) | #if defined(__INTERIX) | ||||
| OS_INTERIX | OS_INTERIX | ||||
| #endif | #endif | ||||
| #if defined(__gnu_hurd__) | |||||
| /* Hurd is very similar to GNU/Linux, it should work out of the box */ | |||||
| OS_LINUX | |||||
| #endif | |||||
| #if defined(__i386) || defined(_X86) | #if defined(__i386) || defined(_X86) | ||||
| ARCH_X86 | ARCH_X86 | ||||
| #endif | #endif | ||||
| @@ -5,7 +5,7 @@ | |||||
| TOPDIR = .. | TOPDIR = .. | ||||
| include $(TOPDIR)/Makefile.system | include $(TOPDIR)/Makefile.system | ||||
| CFLAGS += -DADD$(BU) -DCBLAS | |||||
| override CFLAGS += -DADD$(BU) -DCBLAS | |||||
| LIB = $(TOPDIR)/$(LIBNAME) | LIB = $(TOPDIR)/$(LIBNAME) | ||||
| @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| a = (FLOAT *)args -> a; | a = (FLOAT *)args -> a; | ||||
| x = (FLOAT *)args -> b; | x = (FLOAT *)args -> b; | ||||
| y = (FLOAT *)args -> c; | |||||
| lda = args -> lda; | lda = args -> lda; | ||||
| incx = args -> ldb; | incx = args -> ldb; | ||||
| @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| n_from = 0; | n_from = 0; | ||||
| n_to = n; | n_to = n; | ||||
| //Use y as each thread's n* COMPSIZE elements in sb buffer | |||||
| y = buffer; | |||||
| buffer += ((COMPSIZE * n + 1023) & ~1023); | |||||
| if (range_m) { | if (range_m) { | ||||
| n_from = *(range_m + 0); | n_from = *(range_m + 0); | ||||
| n_to = *(range_m + 1); | n_to = *(range_m + 1); | ||||
| @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| a += n_from * lda * COMPSIZE; | a += n_from * lda * COMPSIZE; | ||||
| } | } | ||||
| if (range_n) y += *range_n * COMPSIZE; | |||||
| if (incx != 1) { | if (incx != 1) { | ||||
| COPY_K(n, x, incx, buffer, 1); | COPY_K(n, x, incx, buffer, 1); | ||||
| @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| if (num_cpu) { | if (num_cpu) { | ||||
| queue[0].sa = NULL; | queue[0].sa = NULL; | ||||
| queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; | |||||
| queue[0].sb = buffer; | |||||
| queue[num_cpu - 1].next = NULL; | queue[num_cpu - 1].next = NULL; | ||||
| exec_blas(num_cpu, queue); | exec_blas(num_cpu, queue); | ||||
| @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| #else | #else | ||||
| ONE, ZERO, | ONE, ZERO, | ||||
| #endif | #endif | ||||
| buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); | |||||
| (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); | |||||
| } | } | ||||
| AXPYU_K(n, 0, 0, | AXPYU_K(n, 0, 0, | ||||
| @@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| queue[num_cpu].args = arg; | queue[num_cpu].args = arg; | ||||
| queue[num_cpu].range_m = range_m; | queue[num_cpu].range_m = range_m; | ||||
| queue[num_cpu].range_n = &range[num_cpu]; | queue[num_cpu].range_n = &range[num_cpu]; | ||||
| #if defined(LOONGSON3A) | |||||
| #if 0 //defined(LOONGSON3A) | |||||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | ||||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | ||||
| #else | #else | ||||
| @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| } | } | ||||
| if (num_cpu) { | if (num_cpu) { | ||||
| #if defined(LOONGSON3A) | |||||
| #if 0 //defined(LOONGSON3A) | |||||
| queue[0].sa = sa; | queue[0].sa = sa; | ||||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | ||||
| #else | #else | ||||
| @@ -1,12 +1,12 @@ | |||||
| TOPDIR = ../.. | TOPDIR = ../.. | ||||
| include ../../Makefile.system | include ../../Makefile.system | ||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) | |||||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) | |||||
| COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | ||||
| ifdef SMP | ifdef SMP | ||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) | |||||
| ifndef NO_AFFINITY | ifndef NO_AFFINITY | ||||
| COMMONOBJS += init.$(SUFFIX) | COMMONOBJS += init.$(SUFFIX) | ||||
| endif | endif | ||||
| @@ -14,7 +14,7 @@ endif | |||||
| # COMMONOBJS += info.$(SUFFIX) | # COMMONOBJS += info.$(SUFFIX) | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| COMMONOBJS += dynamic.$(SUFFIX) | COMMONOBJS += dynamic.$(SUFFIX) | ||||
| else | else | ||||
| COMMONOBJS += parameter.$(SUFFIX) | COMMONOBJS += parameter.$(SUFFIX) | ||||
| @@ -70,7 +70,7 @@ ifndef BLAS_SERVER | |||||
| BLAS_SERVER = blas_server.c | BLAS_SERVER = blas_server.c | ||||
| endif | endif | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | ||||
| else | else | ||||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | ||||
| @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. | |||||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| @@ -215,7 +218,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| hpl : CFLAGS += -DHPL | |||||
| hpl_p : CFLAGS += -DHPL | |||||
| hpl : override CFLAGS += -DHPL | |||||
| hpl_p : override CFLAGS += -DHPL | |||||
| include $(TOPDIR)/Makefile.tail | include $(TOPDIR)/Makefile.tail | ||||
| @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -435,7 +436,7 @@ static int blas_thread_server(void *arg){ | |||||
| blas_memory_free(buffer); | blas_memory_free(buffer); | ||||
| pthread_exit(NULL); | |||||
| //pthread_exit(NULL); | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -770,6 +771,19 @@ void goto_set_num_threads(int num_threads) { | |||||
| if (num_threads < 1) num_threads = blas_num_threads; | if (num_threads < 1) num_threads = blas_num_threads; | ||||
| #ifndef NO_AFFINITY | |||||
| if (num_threads == 1) { | |||||
| if (blas_cpu_number == 1){ | |||||
| //OpenBLAS is already single thread. | |||||
| return; | |||||
| }else{ | |||||
| //From multi-threads to single thread | |||||
| //Restore the original affinity mask | |||||
| gotoblas_set_affinity(-1); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | ||||
| if (num_threads > blas_num_threads) { | if (num_threads > blas_num_threads) { | ||||
| @@ -800,6 +814,13 @@ void goto_set_num_threads(int num_threads) { | |||||
| UNLOCK_COMMAND(&server_lock); | UNLOCK_COMMAND(&server_lock); | ||||
| } | } | ||||
| #ifndef NO_AFFINITY | |||||
| if(blas_cpu_number == 1 && num_threads > 1){ | |||||
| //Restore the thread 0 affinity. | |||||
| gotoblas_set_affinity(0); | |||||
| } | |||||
| #endif | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| #if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
| @@ -49,8 +49,12 @@ | |||||
| int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||||
| void goto_set_num_threads(int num_threads) { | void goto_set_num_threads(int num_threads) { | ||||
| int i=0; | |||||
| if (num_threads < 1) num_threads = blas_num_threads; | if (num_threads < 1) num_threads = blas_num_threads; | ||||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | ||||
| @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| omp_set_num_threads(blas_cpu_number); | omp_set_num_threads(blas_cpu_number); | ||||
| //adjust buffer for each thread | |||||
| for(i=0; i<blas_cpu_number; i++){ | |||||
| if(blas_thread_buffer[i]==NULL){ | |||||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||||
| } | |||||
| } | |||||
| for(; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| #if defined(ARCH_MIPS64) | #if defined(ARCH_MIPS64) | ||||
| //set parameters for different number of threads. | //set parameters for different number of threads. | ||||
| blas_set_parameter(); | blas_set_parameter(); | ||||
| @@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) { | |||||
| int blas_thread_init(void){ | int blas_thread_init(void){ | ||||
| int i=0; | |||||
| blas_get_cpu_number(); | blas_get_cpu_number(); | ||||
| blas_server_avail = 1; | blas_server_avail = 1; | ||||
| for(i=0; i<blas_num_threads; i++){ | |||||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||||
| } | |||||
| for(; i<MAX_CPU_NUMBER; i++){ | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| int BLASFUNC(blas_thread_shutdown)(void){ | int BLASFUNC(blas_thread_shutdown)(void){ | ||||
| int i=0; | |||||
| blas_server_avail = 0; | blas_server_avail = 0; | ||||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||||
| static void exec_threads(blas_queue_t *queue){ | static void exec_threads(blas_queue_t *queue){ | ||||
| void *buffer, *sa, *sb; | void *buffer, *sa, *sb; | ||||
| int pos=0, release_flag=0; | |||||
| buffer = NULL; | buffer = NULL; | ||||
| sa = queue -> sa; | sa = queue -> sa; | ||||
| sb = queue -> sb; | sb = queue -> sb; | ||||
| @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | ||||
| buffer = blas_memory_alloc(2); | |||||
| pos = omp_get_thread_num(); | |||||
| buffer = blas_thread_buffer[pos]; | |||||
| //fallback | |||||
| if(buffer==NULL) { | |||||
| buffer = blas_memory_alloc(2); | |||||
| release_flag=1; | |||||
| } | |||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | ||||
| @@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| } | } | ||||
| @@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ | |||||
| } | } | ||||
| if (buffer != NULL) blas_memory_free(buffer); | |||||
| if (release_flag) blas_memory_free(buffer); | |||||
| } | } | ||||
| @@ -63,6 +63,8 @@ static blas_pool_t pool; | |||||
| static HANDLE blas_threads [MAX_CPU_NUMBER]; | static HANDLE blas_threads [MAX_CPU_NUMBER]; | ||||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | static DWORD blas_threads_id[MAX_CPU_NUMBER]; | ||||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | ||||
| if (!(mode & BLAS_COMPLEX)){ | if (!(mode & BLAS_COMPLEX)){ | ||||
| @@ -179,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||||
| do { | do { | ||||
| action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); | action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); | ||||
| } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); | |||||
| } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); | |||||
| if (action == WAIT_OBJECT_0 + 1) break; | if (action == WAIT_OBJECT_0 + 1) break; | ||||
| @@ -251,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| } | } | ||||
| } | } | ||||
| queue->sb=sb; | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -263,7 +266,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||||
| } else { | } else { | ||||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | legacy_exec(routine, queue -> mode, queue -> args, sb); | ||||
| } | } | ||||
| } | |||||
| }else{ | |||||
| continue; //if queue == NULL | |||||
| } | |||||
| #ifdef SMP_DEBUG | #ifdef SMP_DEBUG | ||||
| fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | ||||
| @@ -425,7 +430,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| /* Shutdown procedure, but user don't have to call this routine. The */ | /* Shutdown procedure, but user don't have to call this routine. The */ | ||||
| /* kernel automatically kill threads. */ | /* kernel automatically kill threads. */ | ||||
| int blas_thread_shutdown_(void){ | |||||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| int i; | int i; | ||||
| @@ -437,7 +442,7 @@ int blas_thread_shutdown_(void){ | |||||
| SetEvent(pool.killed); | SetEvent(pool.killed); | ||||
| for(i = 0; i < blas_cpu_number - 1; i++){ | |||||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||||
| WaitForSingleObject(blas_threads[i], INFINITE); | WaitForSingleObject(blas_threads[i], INFINITE); | ||||
| } | } | ||||
| @@ -448,3 +453,47 @@ int blas_thread_shutdown_(void){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void goto_set_num_threads(int num_threads) | |||||
| { | |||||
| long i; | |||||
| if (num_threads < 1) num_threads = blas_cpu_number; | |||||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||||
| if (num_threads > blas_num_threads) { | |||||
| LOCK_COMMAND(&server_lock); | |||||
| //increased_threads = 1; | |||||
| if (!blas_server_avail){ | |||||
| InitializeCriticalSection(&pool.lock); | |||||
| pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); | |||||
| pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); | |||||
| pool.shutdown = 0; | |||||
| pool.queue = NULL; | |||||
| blas_server_avail = 1; | |||||
| } | |||||
| for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||||
| blas_threads[i] = CreateThread(NULL, 0, | |||||
| blas_thread_server, (void *)i, | |||||
| 0, &blas_threads_id[i]); | |||||
| } | |||||
| blas_num_threads = num_threads; | |||||
| UNLOCK_COMMAND(&server_lock); | |||||
| } | |||||
| blas_cpu_number = num_threads; | |||||
| } | |||||
| void openblas_set_num_threads(int num) | |||||
| { | |||||
| goto_set_num_threads(num); | |||||
| } | |||||
| @@ -60,6 +60,16 @@ extern gotoblas_t gotoblas_NEHALEM; | |||||
| extern gotoblas_t gotoblas_OPTERON; | extern gotoblas_t gotoblas_OPTERON; | ||||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | extern gotoblas_t gotoblas_OPTERON_SSE3; | ||||
| extern gotoblas_t gotoblas_BARCELONA; | extern gotoblas_t gotoblas_BARCELONA; | ||||
| extern gotoblas_t gotoblas_BOBCAT; | |||||
| #ifndef NO_AVX | |||||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||||
| extern gotoblas_t gotoblas_BULLDOZER; | |||||
| #else | |||||
| //Use NEHALEM kernels for sandy bridge | |||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||||
| #endif | |||||
| #define VENDOR_INTEL 1 | #define VENDOR_INTEL 1 | ||||
| #define VENDOR_AMD 2 | #define VENDOR_AMD 2 | ||||
| @@ -68,6 +78,32 @@ extern gotoblas_t gotoblas_BARCELONA; | |||||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | ||||
| #ifndef NO_AVX | |||||
| static inline void xgetbv(int op, int * eax, int * edx){ | |||||
| //Use binary code for xgetbv | |||||
| __asm__ __volatile__ | |||||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||||
| } | |||||
| #endif | |||||
| int support_avx(){ | |||||
| #ifndef NO_AVX | |||||
| int eax, ebx, ecx, edx; | |||||
| int ret=0; | |||||
| cpuid(1, &eax, &ebx, &ecx, &edx); | |||||
| if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ | |||||
| xgetbv(0, &eax, &edx); | |||||
| if((eax & 6) == 6){ | |||||
| ret=1; //OS support AVX | |||||
| } | |||||
| } | |||||
| return ret; | |||||
| #else | |||||
| return 0; | |||||
| #endif | |||||
| } | |||||
| static int get_vendor(void){ | static int get_vendor(void){ | ||||
| int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
| char vendor[13]; | char vendor[13]; | ||||
| @@ -122,15 +158,39 @@ static gotoblas_t *get_coretype(void){ | |||||
| if (model == 12) return &gotoblas_ATOM; | if (model == 12) return &gotoblas_ATOM; | ||||
| return NULL; | return NULL; | ||||
| case 2: | |||||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
| // Xeon (Clarkdale), 32nm | |||||
| if (model == 5) return &gotoblas_NEHALEM; | |||||
| case 2: | |||||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
| // Xeon (Clarkdale), 32nm | |||||
| if (model == 5) return &gotoblas_NEHALEM; | |||||
| //Intel Xeon Processor 5600 (Westmere-EP) | |||||
| if (model == 12) return &gotoblas_NEHALEM; | |||||
| return NULL; | |||||
| //Intel Xeon Processor 5600 (Westmere-EP) | |||||
| //Xeon Processor E7 (Westmere-EX) | |||||
| //Xeon E7540 | |||||
| if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; | |||||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||||
| //Intel Core i7-3000 / Xeon E5 | |||||
| if (model == 10 || model == 13) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | |||||
| case 3: | |||||
| //Intel Sandy Bridge 22nm (Ivy Bridge?) | |||||
| if (model == 10) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | |||||
| } | } | ||||
| case 0xf: | case 0xf: | ||||
| if (model <= 0x2) return &gotoblas_NORTHWOOD; | if (model <= 0x2) return &gotoblas_NORTHWOOD; | ||||
| @@ -144,7 +204,17 @@ static gotoblas_t *get_coretype(void){ | |||||
| if ((exfamily == 0) || (exfamily == 2)) { | if ((exfamily == 0) || (exfamily == 2)) { | ||||
| if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | ||||
| else return &gotoblas_OPTERON; | else return &gotoblas_OPTERON; | ||||
| } else { | |||||
| } else if (exfamily == 5) { | |||||
| return &gotoblas_BOBCAT; | |||||
| } else if (exfamily == 6) { | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return &gotoblas_BULLDOZER; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } else { | |||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| } | } | ||||
| @@ -178,6 +248,9 @@ static char *corename[] = { | |||||
| "Opteron(SSE3)", | "Opteron(SSE3)", | ||||
| "Barcelona", | "Barcelona", | ||||
| "Nano", | "Nano", | ||||
| "Sandybridge", | |||||
| "Bobcat", | |||||
| "Bulldozer", | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -197,7 +270,10 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_OPTERON) return corename[13]; | if (gotoblas == &gotoblas_OPTERON) return corename[13]; | ||||
| if (gotoblas == &gotoblas_BARCELONA) return corename[14]; | if (gotoblas == &gotoblas_BARCELONA) return corename[14]; | ||||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | if (gotoblas == &gotoblas_NANO) return corename[15]; | ||||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -211,12 +287,21 @@ void gotoblas_dynamic_init(void) { | |||||
| if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | ||||
| #else | #else | ||||
| if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | ||||
| /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ | |||||
| if (sizeof(void*) == 8) { | |||||
| if (gotoblas == &gotoblas_KATMAI || | |||||
| gotoblas == &gotoblas_COPPERMINE || | |||||
| gotoblas == &gotoblas_NORTHWOOD || | |||||
| gotoblas == &gotoblas_BANIAS || | |||||
| gotoblas == &gotoblas_ATHLON) | |||||
| gotoblas = &gotoblas_PRESCOTT; | |||||
| } | |||||
| #endif | #endif | ||||
| if (gotoblas && gotoblas -> init) { | if (gotoblas && gotoblas -> init) { | ||||
| gotoblas -> init(); | gotoblas -> init(); | ||||
| } else { | } else { | ||||
| fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||||
| fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define MAX_NODES 16 | #define MAX_NODES 16 | ||||
| #define MAX_CPUS 256 | #define MAX_CPUS 256 | ||||
| #define NCPUBITS (8*sizeof(unsigned long)) | |||||
| #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) | |||||
| #define CPUELT(cpu) ((cpu) / NCPUBITS) | |||||
| #define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) | |||||
| #define SH_MAGIC 0x510510 | #define SH_MAGIC 0x510510 | ||||
| @@ -103,10 +108,10 @@ typedef struct { | |||||
| int num_nodes; | int num_nodes; | ||||
| int num_procs; | int num_procs; | ||||
| int final_num_procs; | int final_num_procs; | ||||
| unsigned long avail; | |||||
| unsigned long avail [MAX_BITMASK_LEN]; | |||||
| int avail_count; | |||||
| unsigned long cpu_info [MAX_CPUS]; | unsigned long cpu_info [MAX_CPUS]; | ||||
| unsigned long node_info [MAX_NODES]; | |||||
| unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; | |||||
| int cpu_use[MAX_CPUS]; | int cpu_use[MAX_CPUS]; | ||||
| } shm_t; | } shm_t; | ||||
| @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; | |||||
| static int shmid, pshmid; | static int shmid, pshmid; | ||||
| static void *paddr; | static void *paddr; | ||||
| static unsigned long lprocmask, lnodemask; | |||||
| static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; | |||||
| static int lprocmask_count = 0; | |||||
| static int numprocs = 1; | static int numprocs = 1; | ||||
| static int numnodes = 1; | static int numnodes = 1; | ||||
| @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { | |||||
| than sizeof(unsigned long). On 64 bits, the limit | than sizeof(unsigned long). On 64 bits, the limit | ||||
| is 64. On 32 bits, it is 32. | is 64. On 32 bits, it is 32. | ||||
| ***/ | ***/ | ||||
| static inline unsigned long get_cpumap(int node) { | |||||
| static inline void get_cpumap(int node, unsigned long * node_info) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char name[160]; | char name[160]; | ||||
| char cpumap[160]; | char cpumap[160]; | ||||
| char *p, *dummy; | |||||
| char *dummy; | |||||
| int i=0; | int i=0; | ||||
| int count=0; | |||||
| int k=0; | |||||
| sprintf(name, CPUMAP_NAME, node); | sprintf(name, CPUMAP_NAME, node); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| for(i=0; i<32; i++){ | |||||
| affinity[i] = 0; | |||||
| } | |||||
| affinity = 0; | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, cpumap, sizeof(cpumap)); | read(infile, cpumap, sizeof(cpumap)); | ||||
| p = cpumap; | |||||
| while (*p != '\n' && i<160){ | |||||
| if(*p != ',') { | |||||
| name[i++]=*p; | |||||
| } | |||||
| p++; | |||||
| } | |||||
| p = name; | |||||
| // while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| //Enough data for Hex | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtoul(p, &dummy, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||||
| // revert the sequence | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| node_info[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static inline unsigned long get_share(int cpu, int level) { | |||||
| static inline void get_share(int cpu, int level, unsigned long * share) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char cpumap[160]; | |||||
| char name[160]; | char name[160]; | ||||
| char *p; | |||||
| char *dummy; | |||||
| int count=0; | |||||
| int i=0,k=0; | |||||
| int bitmask_idx = 0; | |||||
| sprintf(name, SHARE_NAME, cpu, level); | sprintf(name, SHARE_NAME, cpu, level); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| affinity = (1UL << cpu); | |||||
| // Init share | |||||
| for(i=0; i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=0; | |||||
| } | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| share[bitmask_idx] = CPUMASK(cpu); | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, name, sizeof(name)); | |||||
| p = name; | |||||
| read(infile, cpumap, sizeof(cpumap)); | |||||
| while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| //Enough data | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtol(p, &p, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||||
| // revert the sequence | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static int numa_check(void) { | static int numa_check(void) { | ||||
| @@ -248,6 +298,7 @@ static int numa_check(void) { | |||||
| DIR *dp; | DIR *dp; | ||||
| struct dirent *dir; | struct dirent *dir; | ||||
| int node; | int node; | ||||
| int j; | |||||
| common -> num_nodes = 0; | common -> num_nodes = 0; | ||||
| @@ -258,7 +309,9 @@ static int numa_check(void) { | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; | |||||
| for (node = 0; node < MAX_NODES; node ++) { | |||||
| for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; | |||||
| } | |||||
| while ((dir = readdir(dp)) != NULL) { | while ((dir = readdir(dp)) != NULL) { | ||||
| if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | ||||
| @@ -266,12 +319,12 @@ static int numa_check(void) { | |||||
| node = atoi(&dir -> d_name[4]); | node = atoi(&dir -> d_name[4]); | ||||
| if (node > MAX_NODES) { | if (node > MAX_NODES) { | ||||
| fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| common -> num_nodes ++; | common -> num_nodes ++; | ||||
| common -> node_info[node] = get_cpumap(node); | |||||
| get_cpumap(node, common->node_info[node]); | |||||
| } | } | ||||
| } | } | ||||
| @@ -284,7 +337,7 @@ static int numa_check(void) { | |||||
| fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | ||||
| for (node = 0; node < common -> num_nodes; node ++) | for (node = 0; node < common -> num_nodes; node ++) | ||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); | |||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); | |||||
| #endif | #endif | ||||
| return common -> num_nodes; | return common -> num_nodes; | ||||
| @@ -296,11 +349,13 @@ static void numa_mapping(void) { | |||||
| int i, j, h; | int i, j, h; | ||||
| unsigned long work, bit; | unsigned long work, bit; | ||||
| int count = 0; | int count = 0; | ||||
| int bitmask_idx = 0; | |||||
| for (node = 0; node < common -> num_nodes; node ++) { | for (node = 0; node < common -> num_nodes; node ++) { | ||||
| core = 0; | core = 0; | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| if (common -> node_info[node] & common -> avail & (1UL << cpu)) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { | |||||
| common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | ||||
| count ++; | count ++; | ||||
| core ++; | core ++; | ||||
| @@ -357,58 +412,92 @@ static void numa_mapping(void) { | |||||
| static void disable_hyperthread(void) { | static void disable_hyperthread(void) { | ||||
| unsigned long share; | |||||
| unsigned long share[MAX_BITMASK_LEN]; | |||||
| int cpu; | int cpu; | ||||
| int bitmask_idx = 0; | |||||
| int i=0, count=0; | |||||
| bitmask_idx = CPUELT(common -> num_procs); | |||||
| if(common->num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||||
| exit(1); | |||||
| }else if(common->num_procs == 64){ | |||||
| common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| common -> avail = (1UL << common -> num_procs) - 1; | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> num_procs) != 1){ | |||||
| common -> avail[count++] = CPUMASK(common -> num_procs) - 1; | |||||
| } | |||||
| common -> avail_count = count; | |||||
| /* if(common->num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->num_procs == 64){ */ | |||||
| /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* common -> avail = (1UL << common -> num_procs) - 1; */ | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | |||||
| fprintf(stderr, "\nAvail CPUs : "); | |||||
| for(i=0; i<count; i++) | |||||
| fprintf(stderr, "%04lx ", common -> avail[i]); | |||||
| fprintf(stderr, ".\n"); | |||||
| #endif | #endif | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| share = (get_share(cpu, 1) & common -> avail); | |||||
| if (popcount(share) > 1) { | |||||
| get_share(cpu, 1, share); | |||||
| //When the shared cpu are in different element of share & avail array, this may be a bug. | |||||
| for (i = 0; i < count ; i++){ | |||||
| share[i] &= common->avail[i]; | |||||
| if (popcount(share[i]) > 1) { | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share & ~(1UL << cpu)); | |||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share[i] & ~(CPUMASK(cpu))); | |||||
| #endif | #endif | ||||
| common -> avail &= ~((share & ~(1UL << cpu))); | |||||
| common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| static void disable_affinity(void) { | static void disable_affinity(void) { | ||||
| int i=0; | |||||
| int bitmask_idx=0; | |||||
| int count=0; | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); | |||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); | |||||
| fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | ||||
| #endif | #endif | ||||
| if(common->final_num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||||
| exit(1); | |||||
| }else if(common->final_num_procs == 64){ | |||||
| lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| lprocmask = (1UL << common -> final_num_procs) - 1; | |||||
| /* if(common->final_num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->final_num_procs == 64){ */ | |||||
| /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* lprocmask = (1UL << common -> final_num_procs) - 1; */ | |||||
| bitmask_idx = CPUELT(common -> final_num_procs); | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> final_num_procs) != 1){ | |||||
| lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; | |||||
| } | |||||
| lprocmask_count = count; | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | |||||
| for(i=0; i< count; i++){ | |||||
| lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); | |||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -498,7 +587,7 @@ static void create_pshmem(void) { | |||||
| static void local_cpu_map(void) { | static void local_cpu_map(void) { | ||||
| int cpu, id, mapping; | int cpu, id, mapping; | ||||
| int bitmask_idx = 0; | |||||
| cpu = 0; | cpu = 0; | ||||
| mapping = 0; | mapping = 0; | ||||
| @@ -508,8 +597,9 @@ static void local_cpu_map(void) { | |||||
| if (id > 0) { | if (id > 0) { | ||||
| if (is_dead(id)) common -> cpu_use[cpu] = 0; | if (is_dead(id)) common -> cpu_use[cpu] = 0; | ||||
| } | } | ||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { | |||||
| common -> cpu_use[cpu] = pshmid; | common -> cpu_use[cpu] = pshmid; | ||||
| cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | ||||
| @@ -595,6 +685,7 @@ void gotoblas_affinity_init(void) { | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| cpu_set_t cpu_mask; | cpu_set_t cpu_mask; | ||||
| #endif | #endif | ||||
| int i; | |||||
| if (initialized) return; | if (initialized) return; | ||||
| @@ -646,6 +737,11 @@ void gotoblas_affinity_init(void) { | |||||
| common -> num_procs = get_nprocs(); | common -> num_procs = get_nprocs(); | ||||
| if(common -> num_procs > MAX_CPUS) { | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||||
| exit(1); | |||||
| } | |||||
| for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | ||||
| numa_check(); | numa_check(); | ||||
| @@ -654,7 +750,8 @@ void gotoblas_affinity_init(void) { | |||||
| if (common -> num_nodes > 1) numa_mapping(); | if (common -> num_nodes > 1) numa_mapping(); | ||||
| common -> final_num_procs = popcount(common -> avail); | |||||
| common -> final_num_procs = 0; | |||||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); | |||||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | ||||
| @@ -664,7 +761,8 @@ void gotoblas_affinity_init(void) { | |||||
| disable_affinity(); | disable_affinity(); | ||||
| num_avail = popcount(lprocmask); | |||||
| num_avail = 0; | |||||
| for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); | |||||
| if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <sys/syscall.h> | #include <sys/syscall.h> | ||||
| #endif | #endif | ||||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| #include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
| #endif | #endif | ||||
| @@ -185,7 +185,7 @@ int get_num_procs(void) { | |||||
| #endif | #endif | ||||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_FREEBSD) | |||||
| int get_num_procs(void) { | int get_num_procs(void) { | ||||
| @@ -206,7 +206,27 @@ int get_num_procs(void) { | |||||
| #endif | #endif | ||||
| #if defined(OS_DARWIN) | |||||
| int get_num_procs(void) { | |||||
| static int nums = 0; | |||||
| size_t len; | |||||
| if (nums == 0){ | |||||
| len = sizeof(int); | |||||
| sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); | |||||
| } | |||||
| return nums; | |||||
| } | |||||
| #endif | |||||
| /* | |||||
| OpenBLAS uses the numbers of CPU cores in multithreading. | |||||
| It can be set by openblas_set_num_threads(int num_threads); | |||||
| */ | |||||
| int blas_cpu_number = 0; | int blas_cpu_number = 0; | ||||
| /* | |||||
| The numbers of threads in the thread pool. | |||||
| This value is equal or large than blas_cpu_number. This means some threads are sleep. | |||||
| */ | |||||
| int blas_num_threads = 0; | int blas_num_threads = 0; | ||||
| int goto_get_num_procs (void) { | int goto_get_num_procs (void) { | ||||
| @@ -215,7 +235,7 @@ int goto_get_num_procs (void) { | |||||
| int blas_get_cpu_number(void){ | int blas_get_cpu_number(void){ | ||||
| char *p; | char *p; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| int max_num; | int max_num; | ||||
| #endif | #endif | ||||
| int blas_goto_num = 0; | int blas_goto_num = 0; | ||||
| @@ -223,7 +243,7 @@ int blas_get_cpu_number(void){ | |||||
| if (blas_num_threads) return blas_num_threads; | if (blas_num_threads) return blas_num_threads; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| max_num = get_num_procs(); | max_num = get_num_procs(); | ||||
| #endif | #endif | ||||
| @@ -250,7 +270,7 @@ int blas_get_cpu_number(void){ | |||||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | ||||
| else blas_num_threads = MAX_CPU_NUMBER; | else blas_num_threads = MAX_CPU_NUMBER; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | if (blas_num_threads > max_num) blas_num_threads = max_num; | ||||
| #endif | #endif | ||||
| @@ -1128,7 +1148,7 @@ static BLASULONG init_lock = 0UL; | |||||
| static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | ||||
| void *sa, void *sb, BLASLONG pos) { | void *sa, void *sb, BLASLONG pos) { | ||||
| #ifndef ARCH_POWER | |||||
| #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) | |||||
| long size; | long size; | ||||
| BLASULONG buffer; | BLASULONG buffer; | ||||
| @@ -1289,6 +1309,7 @@ void DESTRUCTOR gotoblas_quit(void) { | |||||
| moncontrol (1); | moncontrol (1); | ||||
| #endif | #endif | ||||
| blas_shutdown(); | |||||
| } | } | ||||
| #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | ||||
| @@ -0,0 +1,59 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include "common.h" | |||||
| static char* openblas_config_str="" | |||||
| #ifdef USE64BITINT | |||||
| "USE64BITINT " | |||||
| #endif | |||||
| #ifdef NO_CBLAS | |||||
| "NO_CBLAS " | |||||
| #endif | |||||
| #ifdef NO_LAPACK | |||||
| "NO_LAPACK " | |||||
| #endif | |||||
| #ifdef NO_LAPACKE | |||||
| "NO_LAPACKE " | |||||
| #endif | |||||
| #ifdef DYNAMIC_ARCH | |||||
| "DYNAMIC_ARCH " | |||||
| #endif | |||||
| #ifdef NO_AFFINITY | |||||
| "NO_AFFINITY " | |||||
| #endif | |||||
| ; | |||||
| char* CNAME() { | |||||
| return openblas_config_str; | |||||
| } | |||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -33,13 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef SMP_SERVER | #ifdef SMP_SERVER | ||||
| #ifdef OS_LINUX | |||||
| extern void openblas_set_num_threads(int num_threads) ; | extern void openblas_set_num_threads(int num_threads) ; | ||||
| void NAME(int* num_threads){ | |||||
| void openblas_set_num_threads_(int* num_threads){ | |||||
| openblas_set_num_threads(*num_threads); | openblas_set_num_threads(*num_threads); | ||||
| } | } | ||||
| #endif | |||||
| #else | |||||
| //Single thread | |||||
| void openblas_set_num_threads(int num_threads) { | |||||
| } | |||||
| void openblas_set_num_threads_(int* num_threads){ | |||||
| } | |||||
| #endif | #endif | ||||
| @@ -163,9 +163,9 @@ int get_L2_size(void){ | |||||
| int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) | |||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | ||||
| @@ -384,6 +384,17 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(SANDYBRIDGE) | |||||
| sgemm_p = 1024; | |||||
| dgemm_p = 512; | |||||
| cgemm_p = 512; | |||||
| zgemm_p = 256; | |||||
| #ifdef EXPRECISION | |||||
| qgemm_p = 256; | |||||
| xgemm_p = 128; | |||||
| #endif | |||||
| #endif | |||||
| #if defined(CORE_PRESCOTT) || defined(GENERIC) | #if defined(CORE_PRESCOTT) || defined(GENERIC) | ||||
| size >>= 6; | size >>= 6; | ||||
| @@ -435,7 +446,7 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(CORE_BARCELONA) | |||||
| #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) | |||||
| size >>= 8; | size >>= 8; | ||||
| sgemm_p = 232 * size; | sgemm_p = 232 * size; | ||||
| @@ -10,10 +10,23 @@ ifndef NO_CBLAS | |||||
| NO_CBLAS = 0 | NO_CBLAS = 0 | ||||
| endif | endif | ||||
| ifndef NO_LAPACK | |||||
| NO_LAPACK = 0 | |||||
| endif | |||||
| ifndef NO_LAPACKE | |||||
| NO_LAPACKE = 0 | |||||
| endif | |||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| ifeq ($(F_COMPILER), GFORTRAN) | ifeq ($(F_COMPILER), GFORTRAN) | ||||
| EXTRALIB += -lgfortran | EXTRALIB += -lgfortran | ||||
| endif | endif | ||||
| ifeq ($(USE_OPENMP), 1) | |||||
| ifeq ($(C_COMPILER), GCC) | |||||
| EXTRALIB += -lgomp | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| @@ -58,15 +71,20 @@ dll : ../$(LIBDLLNAME) | |||||
| dll2 : libgoto2_shared.dll | dll2 : libgoto2_shared.dll | ||||
| # On Windows, we only generate a DLL without a version suffix. This is because | |||||
| # applications which link against the dynamic library reference a fixed DLL name | |||||
| # in their import table. By instead using a stable name it is possible to | |||||
| # upgrade between library versions, without needing to re-link an application. | |||||
| # For more details see: https://github.com/xianyi/OpenBLAS/issues/127. | |||||
| ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) | ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) | ||||
| $(RANLIB) ../$(LIBNAME) | $(RANLIB) ../$(LIBNAME) | ||||
| ifeq ($(BINARY32), 1) | ifeq ($(BINARY32), 1) | ||||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | ||||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||||
| -lib /machine:i386 /def:libopenblas.def | -lib /machine:i386 /def:libopenblas.def | ||||
| else | else | ||||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | ||||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||||
| -lib /machine:X64 /def:libopenblas.def | -lib /machine:X64 /def:libopenblas.def | ||||
| endif | endif | ||||
| @@ -76,13 +94,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def | |||||
| -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) | -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) | ||||
| libopenblas.def : gensymbol | libopenblas.def : gensymbol | ||||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| libgoto2_shared.def : gensymbol | libgoto2_shared.def : gensymbol | ||||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| libgoto_hpl.def : gensymbol | libgoto_hpl.def : gensymbol | ||||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| $(LIBDYNNAME) : ../$(LIBNAME) osx.def | $(LIBDYNNAME) : ../$(LIBNAME) osx.def | ||||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | ||||
| @@ -106,14 +124,15 @@ so : ../$(LIBSONAME) | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), FreeBSD) | |||||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | ||||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | ||||
| -Wl,--retain-symbols-file=linux.def $(EXTRALIB) | |||||
| -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) | |||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
| rm -f linktest | rm -f linktest | ||||
| @@ -163,23 +182,23 @@ static : ../$(LIBNAME) | |||||
| rm -f goto.$(SUFFIX) | rm -f goto.$(SUFFIX) | ||||
| linux.def : gensymbol ../Makefile.system ../getarch.c | linux.def : gensymbol ../Makefile.system ../getarch.c | ||||
| perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| osx.def : gensymbol ../Makefile.system ../getarch.c | osx.def : gensymbol ../Makefile.system ../getarch.c | ||||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| aix.def : gensymbol ../Makefile.system ../getarch.c | aix.def : gensymbol ../Makefile.system ../getarch.c | ||||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||||
| symbol.S : gensymbol | symbol.S : gensymbol | ||||
| perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S | |||||
| perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S | |||||
| test : linktest.c | test : linktest.c | ||||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | ||||
| rm -f linktest | rm -f linktest | ||||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | linktest.c : gensymbol ../Makefile.system ../getarch.c | ||||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c | |||||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c | |||||
| clean :: | clean :: | ||||
| @rm -f *.def *.dylib __.SYMDEF* | @rm -f *.def *.dylib __.SYMDEF* | ||||
| @@ -32,11 +32,12 @@ if ($compiler eq "") { | |||||
| "pgf95", "pgf90", "pgf77", | "pgf95", "pgf90", "pgf77", | ||||
| "ifort"); | "ifort"); | ||||
| OUTER: | |||||
| foreach $lists (@lists) { | foreach $lists (@lists) { | ||||
| foreach $path (@path) { | foreach $path (@path) { | ||||
| if (-f $path . "/" . $lists) { | |||||
| if (-x $path . "/" . $lists) { | |||||
| $compiler = $lists; | $compiler = $lists; | ||||
| break; | |||||
| last OUTER; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -210,6 +211,10 @@ if (!$?) { | |||||
| if ($?) { | if ($?) { | ||||
| $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | ||||
| } | } | ||||
| #For gfortran MIPS | |||||
| if ($?) { | |||||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } | |||||
| $binary = "" if ($?); | $binary = "" if ($?); | ||||
| } | } | ||||
| @@ -218,6 +223,10 @@ if (!$?) { | |||||
| if ($?) { | if ($?) { | ||||
| $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | ||||
| } | } | ||||
| #For gfortran MIPS | |||||
| if ($?) { | |||||
| $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } | |||||
| $binary = "" if ($?); | $binary = "" if ($?); | ||||
| } | } | ||||
| @@ -237,6 +246,8 @@ if ($link ne "") { | |||||
| $link =~ s/\-rpath\s+/\-rpath\@/g; | $link =~ s/\-rpath\s+/\-rpath\@/g; | ||||
| @flags = split(/[\s\,\n]/, $link); | @flags = split(/[\s\,\n]/, $link); | ||||
| # remove leading and trailing quotes from each flag. | |||||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||||
| foreach $flags (@flags) { | foreach $flags (@flags) { | ||||
| if ( | if ( | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -96,12 +96,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_PENRYN */ | /* #define FORCE_PENRYN */ | ||||
| /* #define FORCE_DUNNINGTON */ | /* #define FORCE_DUNNINGTON */ | ||||
| /* #define FORCE_NEHALEM */ | /* #define FORCE_NEHALEM */ | ||||
| /* #define FORCE_SANDYBRIDGE */ | |||||
| /* #define FORCE_ATOM */ | |||||
| /* #define FORCE_ATHLON */ | /* #define FORCE_ATHLON */ | ||||
| /* #define FORCE_OPTERON */ | /* #define FORCE_OPTERON */ | ||||
| /* #define FORCE_OPTERON_SSE3 */ | /* #define FORCE_OPTERON_SSE3 */ | ||||
| /* #define FORCE_BARCELONA */ | /* #define FORCE_BARCELONA */ | ||||
| /* #define FORCE_SHANGHAI */ | /* #define FORCE_SHANGHAI */ | ||||
| /* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
| /* #define FORCE_BOBCAT */ | |||||
| /* #define FORCE_BULLDOZER */ | |||||
| /* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
| /* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
| /* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
| @@ -116,12 +120,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_PPC440FP2 */ | /* #define FORCE_PPC440FP2 */ | ||||
| /* #define FORCE_CELL */ | /* #define FORCE_CELL */ | ||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3A */ | |||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_LOONGSON3A */ | |||||
| /* #define FORCE_LOONGSON3B */ | |||||
| /* #define FORCE_ITANIUM2 */ | /* #define FORCE_ITANIUM2 */ | ||||
| /* #define FORCE_GENERIC */ | |||||
| /* #define FORCE_SPARC */ | /* #define FORCE_SPARC */ | ||||
| /* #define FORCE_SPARCV7 */ | /* #define FORCE_SPARCV7 */ | ||||
| /* #define FORCE_GENERIC */ | |||||
| #ifdef FORCE_P2 | #ifdef FORCE_P2 | ||||
| #define FORCE | #define FORCE | ||||
| @@ -137,32 +141,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "P5" | #define CORENAME "P5" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_COPPERMINE | |||||
| #ifdef FORCE_KATMAI | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #define SUBARCHITECTURE "PENTIUM3" | #define SUBARCHITECTURE "PENTIUM3" | ||||
| #define ARCHCONFIG "-DPENTIUM3 " \ | #define ARCHCONFIG "-DPENTIUM3 " \ | ||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | ||||
| #define LIBNAME "coppermine" | |||||
| #define CORENAME "COPPERMINE" | |||||
| #define LIBNAME "katmai" | |||||
| #define CORENAME "KATMAI" | |||||
| #endif | #endif | ||||
| #ifdef FORCE_KATMAI | |||||
| #ifdef FORCE_COPPERMINE | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #define SUBARCHITECTURE "PENTIUM3" | #define SUBARCHITECTURE "PENTIUM3" | ||||
| #define ARCHCONFIG "-DPENTIUM3 " \ | #define ARCHCONFIG "-DPENTIUM3 " \ | ||||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | ||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | ||||
| #define LIBNAME "katmai" | |||||
| #define CORENAME "KATMAI" | |||||
| #define LIBNAME "coppermine" | |||||
| #define CORENAME "COPPERMINE" | |||||
| #endif | #endif | ||||
| #ifdef FORCE_NORTHWOOD | #ifdef FORCE_NORTHWOOD | ||||
| @@ -278,6 +282,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "NEHALEM" | #define CORENAME "NEHALEM" | ||||
| #endif | #endif | ||||
| #ifdef FORCE_SANDYBRIDGE | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||||
| #define LIBNAME "sandybridge" | |||||
| #define CORENAME "SANDYBRIDGE" | |||||
| #endif | |||||
| #ifdef FORCE_ATOM | #ifdef FORCE_ATOM | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -342,13 +360,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ARCHCONFIG "-DBARCELONA " \ | #define ARCHCONFIG "-DBARCELONA " \ | ||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | ||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ | "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ | ||||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ | |||||
| "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" | "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" | ||||
| #define LIBNAME "barcelona" | #define LIBNAME "barcelona" | ||||
| #define CORENAME "BARCELONA" | #define CORENAME "BARCELONA" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_BOBCAT) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "BOBCAT" | |||||
| #define ARCHCONFIG "-DBOBCAT " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" | |||||
| #define LIBNAME "bobcat" | |||||
| #define CORENAME "BOBCAT" | |||||
| #endif | |||||
| #if defined (FORCE_BULLDOZER) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "BULLDOZER" | |||||
| #define ARCHCONFIG "-DBULLDOZER " \ | |||||
| "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | |||||
| "-DHAVE_AVX -DHAVE_FMA4" | |||||
| #define LIBNAME "bulldozer" | |||||
| #define CORENAME "BULLDOZER" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -34,7 +34,7 @@ int main(int argc, char **argv) { | |||||
| #ifdef USE64BITINT | #ifdef USE64BITINT | ||||
| printf("#define USE64BITINT\n"); | printf("#define USE64BITINT\n"); | ||||
| #endif | #endif | ||||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); | |||||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| @@ -318,7 +318,7 @@ CZBLAS3OBJS = \ | |||||
| ifndef NO_CBLAS | ifndef NO_CBLAS | ||||
| CFLAGS += -I. | |||||
| override CFLAGS += -I. | |||||
| SBLAS1OBJS += $(CSBLAS1OBJS) | SBLAS1OBJS += $(CSBLAS1OBJS) | ||||
| SBLAS2OBJS += $(CSBLAS2OBJS) | SBLAS2OBJS += $(CSBLAS2OBJS) | ||||
| @@ -400,7 +400,7 @@ all :: libs | |||||
| ifdef FUNCTION_PROFILE | ifdef FUNCTION_PROFILE | ||||
| $(BLASOBJS) $(BLASOBJS_P) : functable.h | $(BLASOBJS) $(BLASOBJS_P) : functable.h | ||||
| $(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) | |||||
| $(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) | |||||
| functable.h : Makefile | functable.h : Makefile | ||||
| ./create $(FUNCALLFILES) > functable.h | ./create $(FUNCALLFILES) > functable.h | ||||
| @@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $ | |||||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | ||||
| $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ | $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ | ||||
| $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS | |||||
| $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS | |||||
| srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c | srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| @@ -6,7 +6,7 @@ TOPDIR = .. | |||||
| include $(TOPDIR)/Makefile.system | include $(TOPDIR)/Makefile.system | ||||
| ifdef TARGET_CORE | ifdef TARGET_CORE | ||||
| CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||||
| BUILD_KERNEL = 1 | BUILD_KERNEL = 1 | ||||
| KDIR = | KDIR = | ||||
| TSUFFIX = _$(TARGET_CORE) | TSUFFIX = _$(TARGET_CORE) | ||||
| @@ -48,7 +48,7 @@ HPLOBJS = \ | |||||
| COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) | COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) | ||||
| ifdef DYNAMIC_ARCH | |||||
| ifeq ($(DYNAMIC_ARCH), 1) | |||||
| SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) | SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) | ||||
| CCOMMON_OPT += -DTS=$(TSUFFIX) | CCOMMON_OPT += -DTS=$(TSUFFIX) | ||||
| endif | endif | ||||
| @@ -0,0 +1,235 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*dest0; | |||||
| for (j=0; j<col/4; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| dest0[16] = src0[4]; | |||||
| dest0[17] = src0[5]; | |||||
| dest0[18] = src1[4]; | |||||
| dest0[19] = src1[5]; | |||||
| dest0[20] = src2[4]; | |||||
| dest0[21] = src2[5]; | |||||
| dest0[22] = src3[4]; | |||||
| dest0[23] = src3[5]; | |||||
| dest0[24] = src0[6]; | |||||
| dest0[25] = src0[7]; | |||||
| dest0[26] = src1[6]; | |||||
| dest0[27] = src1[7]; | |||||
| dest0[28] = src2[6]; | |||||
| dest0[29] = src2[7]; | |||||
| dest0[30] = src3[6]; | |||||
| dest0[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| ii = (4<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| ii = (2<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| ii = (1<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<2); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| dest0[8] = src0[4]; | |||||
| dest0[9] = src0[5]; | |||||
| dest0[10] = src1[4]; | |||||
| dest0[11] = src1[5]; | |||||
| dest0[12] = src0[6]; | |||||
| dest0[13] = src0[7]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| ii = (4<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| ii = (2<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| ii = (1<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<1); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| ii = (4<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| ii = (2<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| ii = (1<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,401 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
| BLASLONG i,j; | |||||
| BLASLONG idx=0; | |||||
| BLASLONG ii; | |||||
| FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; | |||||
| for (j=0; j<col/8; j+=1) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src4 = src3+2*srcdim; | |||||
| src5 = src4+2*srcdim; | |||||
| src6 = src5+2*srcdim; | |||||
| src7 = src6+2*srcdim; | |||||
| src = src7+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<4); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| dest0[16] = src0[2]; | |||||
| dest0[17] = src0[3]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src2[2]; | |||||
| dest0[21] = src2[3]; | |||||
| dest0[22] = src3[2]; | |||||
| dest0[23] = src3[3]; | |||||
| dest0[24] = src4[2]; | |||||
| dest0[25] = src4[3]; | |||||
| dest0[26] = src5[2]; | |||||
| dest0[27] = src5[3]; | |||||
| dest0[28] = src6[2]; | |||||
| dest0[29] = src6[3]; | |||||
| dest0[30] = src7[2]; | |||||
| dest0[31] = src7[3]; | |||||
| dest0[32] = src0[4]; | |||||
| dest0[33] = src0[5]; | |||||
| dest0[34] = src1[4]; | |||||
| dest0[35] = src1[5]; | |||||
| dest0[36] = src2[4]; | |||||
| dest0[37] = src2[5]; | |||||
| dest0[38] = src3[4]; | |||||
| dest0[39] = src3[5]; | |||||
| dest0[40] = src4[4]; | |||||
| dest0[41] = src4[5]; | |||||
| dest0[42] = src5[4]; | |||||
| dest0[43] = src5[5]; | |||||
| dest0[44] = src6[4]; | |||||
| dest0[45] = src6[5]; | |||||
| dest0[46] = src7[4]; | |||||
| dest0[47] = src7[5]; | |||||
| dest0[48] = src0[6]; | |||||
| dest0[49] = src0[7]; | |||||
| dest0[50] = src1[6]; | |||||
| dest0[51] = src1[7]; | |||||
| dest0[52] = src2[6]; | |||||
| dest0[53] = src2[7]; | |||||
| dest0[54] = src3[6]; | |||||
| dest0[55] = src3[7]; | |||||
| dest0[56] = src4[6]; | |||||
| dest0[57] = src4[7]; | |||||
| dest0[58] = src5[6]; | |||||
| dest0[59] = src5[7]; | |||||
| dest0[60] = src6[6]; | |||||
| dest0[61] = src6[7]; | |||||
| dest0[62] = src7[6]; | |||||
| dest0[63] = src7[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| src4 = src4+8; | |||||
| src5 = src5+8; | |||||
| src6 = src6+8; | |||||
| src7 = src7+8; | |||||
| ii = (4<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| dest0[16] = src0[2]; | |||||
| dest0[17] = src0[3]; | |||||
| dest0[18] = src1[2]; | |||||
| dest0[19] = src1[3]; | |||||
| dest0[20] = src2[2]; | |||||
| dest0[21] = src2[3]; | |||||
| dest0[22] = src3[2]; | |||||
| dest0[23] = src3[3]; | |||||
| dest0[24] = src4[2]; | |||||
| dest0[25] = src4[3]; | |||||
| dest0[26] = src5[2]; | |||||
| dest0[27] = src5[3]; | |||||
| dest0[28] = src6[2]; | |||||
| dest0[29] = src6[3]; | |||||
| dest0[30] = src7[2]; | |||||
| dest0[31] = src7[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| src4 = src4+4; | |||||
| src5 = src5+4; | |||||
| src6 = src6+4; | |||||
| src7 = src7+4; | |||||
| ii = (2<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src4[0]; | |||||
| dest0[9] = src4[1]; | |||||
| dest0[10] = src5[0]; | |||||
| dest0[11] = src5[1]; | |||||
| dest0[12] = src6[0]; | |||||
| dest0[13] = src6[1]; | |||||
| dest0[14] = src7[0]; | |||||
| dest0[15] = src7[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| src4 = src4+2; | |||||
| src5 = src5+2; | |||||
| src6 = src6+2; | |||||
| src7 = src7+2; | |||||
| ii = (1<<4); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&4) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src2 = src1+2*srcdim; | |||||
| src3 = src2+2*srcdim; | |||||
| src = src3+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<3); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| dest0[16] = src0[4]; | |||||
| dest0[17] = src0[5]; | |||||
| dest0[18] = src1[4]; | |||||
| dest0[19] = src1[5]; | |||||
| dest0[20] = src2[4]; | |||||
| dest0[21] = src2[5]; | |||||
| dest0[22] = src3[4]; | |||||
| dest0[23] = src3[5]; | |||||
| dest0[24] = src0[6]; | |||||
| dest0[25] = src0[7]; | |||||
| dest0[26] = src1[6]; | |||||
| dest0[27] = src1[7]; | |||||
| dest0[28] = src2[6]; | |||||
| dest0[29] = src2[7]; | |||||
| dest0[30] = src3[6]; | |||||
| dest0[31] = src3[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| src2 = src2+8; | |||||
| src3 = src3+8; | |||||
| ii = (4<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| dest0[8] = src0[2]; | |||||
| dest0[9] = src0[3]; | |||||
| dest0[10] = src1[2]; | |||||
| dest0[11] = src1[3]; | |||||
| dest0[12] = src2[2]; | |||||
| dest0[13] = src2[3]; | |||||
| dest0[14] = src3[2]; | |||||
| dest0[15] = src3[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| src2 = src2+4; | |||||
| src3 = src3+4; | |||||
| ii = (2<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src2[0]; | |||||
| dest0[5] = src2[1]; | |||||
| dest0[6] = src3[0]; | |||||
| dest0[7] = src3[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| src2 = src2+2; | |||||
| src3 = src3+2; | |||||
| ii = (1<<3); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&2) | |||||
| { | |||||
| src0 = src; | |||||
| src1 = src0+2*srcdim; | |||||
| src = src1+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<2); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| dest0[8] = src0[4]; | |||||
| dest0[9] = src0[5]; | |||||
| dest0[10] = src1[4]; | |||||
| dest0[11] = src1[5]; | |||||
| dest0[12] = src0[6]; | |||||
| dest0[13] = src0[7]; | |||||
| dest0[14] = src1[6]; | |||||
| dest0[15] = src1[7]; | |||||
| src0 = src0+8; | |||||
| src1 = src1+8; | |||||
| ii = (4<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| dest0[4] = src0[2]; | |||||
| dest0[5] = src0[3]; | |||||
| dest0[6] = src1[2]; | |||||
| dest0[7] = src1[3]; | |||||
| src0 = src0+4; | |||||
| src1 = src1+4; | |||||
| ii = (2<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src1[0]; | |||||
| dest0[3] = src1[1]; | |||||
| src0 = src0+2; | |||||
| src1 = src1+2; | |||||
| ii = (1<<2); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| if (col&1) | |||||
| { | |||||
| src0 = src; | |||||
| src = src0+2*srcdim; | |||||
| dest0 = dest; | |||||
| ii = (row<<1); | |||||
| dest = dest+ii; | |||||
| for (i=0; i<row/4; i+=1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| dest0[4] = src0[4]; | |||||
| dest0[5] = src0[5]; | |||||
| dest0[6] = src0[6]; | |||||
| dest0[7] = src0[7]; | |||||
| src0 = src0+8; | |||||
| ii = (4<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&2) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| dest0[2] = src0[2]; | |||||
| dest0[3] = src0[3]; | |||||
| src0 = src0+4; | |||||
| ii = (2<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| if (row&1) | |||||
| { | |||||
| dest0[0] = src0[0]; | |||||
| dest0[1] = src0[1]; | |||||
| src0 = src0+2; | |||||
| ii = (1<<1); | |||||
| dest0 = dest0+ii; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,237 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Copy a row x col block of two-FLOAT elements from src into dest. | |||||
|  * | |||||
|  * Source rows are 2*srcdim FLOATs apart.  Rows are consumed in groups of | |||||
|  * four, followed by at most one group of two and one single row.  Inside a | |||||
|  * group the columns are taken four at a time (8 FLOATs per row); each such | |||||
|  * panel is row<<3 FLOATs long in dest.  Leftover columns (col&2, col&1) are | |||||
|  * written to separate regions starting at dest+(col&-4)*(2*row) and | |||||
|  * dest+(col&-2)*(2*row) respectively. | |||||
|  * | |||||
|  * NOTE(review): the FLOAT pairs are presumably the real/imaginary parts of | |||||
|  * complex values -- confirm against the build rule that compiles this file. | |||||
|  * | |||||
|  * Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
|   FLOAT *line[4];  /* read cursor for each source row of the current group */ | |||||
|   FLOAT *tail[2];  /* write cursors for the 2-column and 1-column leftovers */ | |||||
|   FLOAT *panel;    /* write cursor stepping through the full 4-column panels */ | |||||
|   BLASLONG height, groups, width; | |||||
|   BLASLONG g, p, r, t, k; | |||||
|   tail[0] = dest + (col&-4)*(2*row); | |||||
|   tail[1] = dest + (col&-2)*(2*row); | |||||
|   for (height=4; height>0; height>>=1) | |||||
|     { | |||||
|       /* any number of 4-row groups, then at most one 2-row and one 1-row group */ | |||||
|       groups = (height==4) ? row/4 : ((row&height) ? 1 : 0); | |||||
|       for (g=0; g<groups; g++) | |||||
|         { | |||||
|           for (r=0; r<height; r++) | |||||
|             line[r] = src + r*(2*srcdim); | |||||
|           src += height*(2*srcdim); | |||||
|           panel = dest; | |||||
|           dest += height<<3; | |||||
|           /* full panels: 4 columns = 8 FLOATs from every row of the group */ | |||||
|           for (p=0; p<col/4; p++) | |||||
|             { | |||||
|               for (r=0; r<height; r++) | |||||
|                 { | |||||
|                   for (k=0; k<8; k++) | |||||
|                     panel[8*r+k] = line[r][k]; | |||||
|                   line[r] += 8; | |||||
|                 } | |||||
|               panel += row<<3; | |||||
|             } | |||||
|           /* leftovers: first the 2-column remainder, then the 1-column one */ | |||||
|           for (t=0, width=2; t<2; t++, width>>=1) | |||||
|             { | |||||
|               if (!(col&width)) continue; | |||||
|               for (r=0; r<height; r++) | |||||
|                 { | |||||
|                   for (k=0; k<2*width; k++) | |||||
|                     tail[t][2*width*r+k] = line[r][k]; | |||||
|                   line[r] += 2*width; | |||||
|                 } | |||||
|               tail[t] += 2*width*height; | |||||
|             } | |||||
|         } | |||||
|     } | |||||
|   return 0; | |||||
| } | |||||
| @@ -0,0 +1,370 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||||
| be used to endorse or promote products derived from this software | |||||
| without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * Copy a row x col block of two-FLOAT elements from src into dest. | |||||
|  * | |||||
|  * Source rows are 2*srcdim FLOATs apart.  Rows are consumed in groups of | |||||
|  * four, followed by at most one group of two and one single row.  Inside a | |||||
|  * group the columns are taken eight at a time (16 FLOATs per row); each such | |||||
|  * panel is row<<4 FLOATs long in dest.  Leftover columns (col&4, col&2, | |||||
|  * col&1) are written to separate regions starting at | |||||
|  * dest+(col&-8)*(2*row), dest+(col&-4)*(2*row) and dest+(col&-2)*(2*row). | |||||
|  * | |||||
|  * NOTE(review): the FLOAT pairs are presumably the real/imaginary parts of | |||||
|  * complex values -- confirm against the build rule that compiles this file. | |||||
|  * | |||||
|  * Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||||
| { | |||||
|   FLOAT *line[4];  /* read cursor for each source row of the current group */ | |||||
|   FLOAT *tail[3];  /* write cursors for the 4-, 2- and 1-column leftovers */ | |||||
|   FLOAT *panel;    /* write cursor stepping through the full 8-column panels */ | |||||
|   BLASLONG height, groups, width; | |||||
|   BLASLONG g, p, r, t, k; | |||||
|   tail[0] = dest + (col&-8)*(2*row); | |||||
|   tail[1] = dest + (col&-4)*(2*row); | |||||
|   tail[2] = dest + (col&-2)*(2*row); | |||||
|   for (height=4; height>0; height>>=1) | |||||
|     { | |||||
|       /* any number of 4-row groups, then at most one 2-row and one 1-row group */ | |||||
|       groups = (height==4) ? row/4 : ((row&height) ? 1 : 0); | |||||
|       for (g=0; g<groups; g++) | |||||
|         { | |||||
|           for (r=0; r<height; r++) | |||||
|             line[r] = src + r*(2*srcdim); | |||||
|           src += height*(2*srcdim); | |||||
|           panel = dest; | |||||
|           dest += height<<4; | |||||
|           /* full panels: 8 columns = 16 FLOATs from every row of the group */ | |||||
|           for (p=0; p<col/8; p++) | |||||
|             { | |||||
|               for (r=0; r<height; r++) | |||||
|                 { | |||||
|                   for (k=0; k<16; k++) | |||||
|                     panel[16*r+k] = line[r][k]; | |||||
|                   line[r] += 16; | |||||
|                 } | |||||
|               panel += row<<4; | |||||
|             } | |||||
|           /* leftovers in decreasing width: 4 columns, then 2, then 1 */ | |||||
|           for (t=0, width=4; t<3; t++, width>>=1) | |||||
|             { | |||||
|               if (!(col&width)) continue; | |||||
|               for (r=0; r<height; r++) | |||||
|                 { | |||||
|                   for (k=0; k<2*width; k++) | |||||
|                     tail[t][2*width*r+k] = line[r][k]; | |||||
|                   line[r] += 2*width; | |||||
|                 } | |||||
|               tail[t] += 2*width*height; | |||||
|             } | |||||
|         } | |||||
|     } | |||||
|   return 0; | |||||
| } | |||||
| @@ -634,10 +634,10 @@ static void init_parameter(void) { | |||||
| TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | ||||
| #endif | #endif | ||||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) | |||||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Katmai, Coppermine, Banias\n"); | |||||
| fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); | |||||
| #endif | #endif | ||||
| TABLE_NAME.sgemm_p = 64 * (l2 >> 7); | TABLE_NAME.sgemm_p = 64 * (l2 >> 7); | ||||
| @@ -746,6 +746,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Sandybridge\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef OPTERON | #ifdef OPTERON | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -778,6 +794,38 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef BOBCAT | |||||
| /* Bobcat build: every *gemm_p table entry takes its compile-time default. */ | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Bobcat\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| /* the q/x entries are only assigned when EXPRECISION is defined */ | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef BULLDOZER | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Bulldozer\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1,59 @@ | |||||
| # Kernel source selection: each variable names the source file (or object) | |||||
| # used for one routine. | |||||
| # NOTE(review): presumably consumed by the kernel build makefiles -- confirm. | |||||
| # SGEMM: INCOPY/ITCOPY are intentionally left empty; ONCOPY/OTCOPY use the | |||||
| # generic C copy routines. | |||||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # DGEMM: all four copy routines are set, taken from the generic C sources. | |||||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # CGEMM: same shape as SGEMM -- INCOPY/ITCOPY empty, ONCOPY/OTCOPY generic. | |||||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # ZGEMM: all four copy routines are set, taken from the generic C sources. | |||||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # TRSM kernels: for S/D/C the RN variant points at the same source as LT. | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
| # ZTRSM: LN, LT and RN all point at the LT source; only RT has its own file. | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
| # GEMM3M kernels for the two complex precisions. | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -0,0 +1,59 @@ | |||||
| # Kernel source selection: each variable names the source file (or object) | |||||
| # used for one routine. | |||||
| # NOTE(review): presumably consumed by the kernel build makefiles -- confirm. | |||||
| # SGEMM: INCOPY/ITCOPY are intentionally left empty; ONCOPY/OTCOPY use the | |||||
| # generic C copy routines. | |||||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # DGEMM: all four copy routines are set, taken from the generic C sources. | |||||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # CGEMM: same shape as SGEMM -- INCOPY/ITCOPY empty, ONCOPY/OTCOPY generic. | |||||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # ZGEMM: all four copy routines are set, taken from the generic C sources. | |||||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| # TRSM kernels: for S/D/C the RN variant points at the same source as LT. | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
| # ZTRSM: LN, LT and RN all point at the LT source; only RT has its own file. | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
| # GEMM3M kernels for the two complex precisions. | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -0,0 +1 @@ | |||||
| # This file defines nothing itself; it pulls in the PENRYN kernel selection. | |||||
| include $(KERNELDIR)/KERNEL.PENRYN | |||||
| @@ -495,7 +495,6 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L999: | .L999: | ||||
| RESTOREREGISTERS | |||||
| subl $8, %esp | subl $8, %esp | ||||
| movss %xmm0, (%esp) | movss %xmm0, (%esp) | ||||
| @@ -76,6 +76,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -596,7 +596,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 4 * SIZE(BB), %xmm2 | movsd 4 * SIZE(BB), %xmm2 | ||||
| @@ -842,7 +842,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1168,7 +1168,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1198,7 +1198,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1347,7 +1347,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -1531,7 +1531,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -1778,7 +1778,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -1793,7 +1793,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -1924,7 +1924,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -2069,7 +2069,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -69,6 +69,12 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef SANDYBRIDGE | |||||
| #define PREFETCHSIZE (16 * 1 - 8) | |||||
| #define PREFETCHW prefetcht0 | |||||
| #define PREFETCHB prefetcht0 | |||||
| #endif | |||||
| #ifndef PREFETCH | #ifndef PREFETCH | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #endif | #endif | ||||
| @@ -262,7 +268,7 @@ | |||||
| movaps -16 * SIZE(AA), %xmm0 | movaps -16 * SIZE(AA), %xmm0 | ||||
| addps %xmm2, %xmm7 | addps %xmm2, %xmm7 | ||||
| #ifndef NEHALEM | |||||
| #if !(defined(NEHALEM) || defined(SANDYBRIDGE)) | |||||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| pshufd $0x93, %xmm1, %xmm2 | pshufd $0x93, %xmm1, %xmm2 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -89,17 +89,22 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA 16 + STACKSIZE(%esp) | |||||
| #define A 20 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||||
| #define STACK_X 28 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||||
| #define Y 36 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||||
| #define BUFFER 44 + STACKSIZE(%esp) | |||||
| #define ARGS 16 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define YY 4+ARGS(%esp) | |||||
| #define AA 8+ARGS(%esp) | |||||
| #define LDAX 12+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -114,6 +119,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -121,7 +127,34 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl Y,J | |||||
| movl J,YY # backup Y | |||||
| movl A,J | |||||
| movl J,AA # backup A | |||||
| movl M,J | |||||
| movl J,MMM # backup M | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $21,J | |||||
| subl J,MMM | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl YY,J | |||||
| movl J,Y | |||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| @@ -651,12 +684,22 @@ | |||||
| addss 0 * SIZE(X), %xmm0 | addss 0 * SIZE(X), %xmm0 | ||||
| movss %xmm0, (Y1) | movss %xmm0, (Y1) | ||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movl M,J | |||||
| leal (,J,SIZE),%eax | |||||
| addl %eax,AA | |||||
| movl YY,J | |||||
| addl %eax,J | |||||
| movl J,YY | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -76,17 +76,22 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA 16 + STACKSIZE(%esp) | |||||
| #define A 24 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||||
| #define STACK_X 32 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||||
| #define Y 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||||
| #define BUFFER 48 + STACKSIZE(%esp) | |||||
| #define ARGS 16 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+ARGS(%esp) | |||||
| #define YY 4+ARGS(%esp) | |||||
| #define AA 8+ARGS(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -101,6 +106,8 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -108,6 +115,33 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl Y,J | |||||
| movl J,YY # backup Y | |||||
| movl A,J | |||||
| movl J,AA # backup A | |||||
| movl M,J | |||||
| movl J,MMM # backup M | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $20,J | |||||
| subl J,MMM | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A | |||||
| movl YY,J | |||||
| movl J,Y | |||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| @@ -677,10 +711,22 @@ | |||||
| ALIGN_3 | ALIGN_3 | ||||
| .L999: | .L999: | ||||
| movl M,J | |||||
| leal (,J,SIZE),%eax | |||||
| addl %eax,AA | |||||
| movl YY,J | |||||
| addl %eax,J | |||||
| movl J,YY | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 4) | #define PREFETCHSIZE (16 * 4) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -89,17 +89,24 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA 16 + STACKSIZE(%esp) | |||||
| #define A 20 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||||
| #define STACK_X 28 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||||
| #define Y 36 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||||
| #define BUFFER 44 + STACKSIZE(%esp) | |||||
| #define ARGS 20 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+STACKSIZE(%esp) | |||||
| #define NN 4+STACKSIZE(%esp) | |||||
| #define AA 8+STACKSIZE(%esp) | |||||
| #define LDAX 12+STACKSIZE(%esp) | |||||
| #define XX 16+STACKSIZE(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -114,6 +121,7 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -122,7 +130,42 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl LDA,LDAX # backup LDA | |||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl X,XX | |||||
| movl N,J | |||||
| movl J,NN # backup N | |||||
| movl A,J | |||||
| movl J,AA # backup A | |||||
| movl M,J | |||||
| movl J,MMM # mov M to MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $22,J # J=2^22; 2^22*sizeof(float)=buffer size(16MB) | |||||
| subl $8, J # Don't use the last 8 floats in the buffer. | |||||
| # Now, split M by block J | |||||
| subl J,MMM # MMM=MMM-J | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A # mov AA to A | |||||
| movl NN,%eax | |||||
| movl %eax,N # reset N | |||||
| movl LDAX, LDA # reset LDA | |||||
| movl XX,X | |||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -198,6 +241,20 @@ | |||||
| jg .L06 | jg .L06 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| // Pad with zeros to prevent loading stale values from the buffer. | |||||
| movl M, I | |||||
| movl $8, J | |||||
| andl $7, I | |||||
| xorps %xmm0, %xmm0 | |||||
| subl I, J | |||||
| ALIGN_2 | |||||
| .L07: | |||||
| movss %xmm0, 0 * SIZE(Y1) | |||||
| addl $SIZE, Y1 | |||||
| decl J | |||||
| jg .L07 | |||||
| ALIGN_4 | |||||
| .L10: | .L10: | ||||
| movl Y, Y1 | movl Y, Y1 | ||||
| @@ -628,10 +685,22 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L999: | .L999: | ||||
| movl M,J | |||||
| leal (,J,SIZE),%eax | |||||
| addl %eax,AA | |||||
| movl XX,J | |||||
| addl %eax,J | |||||
| movl J,XX | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -76,18 +76,24 @@ | |||||
| #endif | #endif | ||||
| #define STACKSIZE 16 | #define STACKSIZE 16 | ||||
| #define ARGS 16 | |||||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||||
| #define MMM 0+STACKSIZE(%esp) | |||||
| #define AA 4+STACKSIZE(%esp) | |||||
| #define LDAX 8+STACKSIZE(%esp) | |||||
| #define NN 12+STACKSIZE(%esp) | |||||
| #define M 4 + STACKSIZE(%esp) | |||||
| #define N 8 + STACKSIZE(%esp) | |||||
| #define ALPHA 16 + STACKSIZE(%esp) | |||||
| #define A 24 + STACKSIZE(%esp) | |||||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||||
| #define STACK_X 32 + STACKSIZE(%esp) | |||||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||||
| #define Y 40 + STACKSIZE(%esp) | |||||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||||
| #define BUFFER 48 + STACKSIZE(%esp) | |||||
| #define I %eax | #define I %eax | ||||
| #define J %ebx | #define J %ebx | ||||
| @@ -101,6 +107,8 @@ | |||||
| PROLOGUE | PROLOGUE | ||||
| subl $ARGS,%esp | |||||
| pushl %ebp | pushl %ebp | ||||
| pushl %edi | pushl %edi | ||||
| pushl %esi | pushl %esi | ||||
| @@ -108,7 +116,40 @@ | |||||
| PROFCODE | PROFCODE | ||||
| movl STACK_LDA, LDA | movl STACK_LDA, LDA | ||||
| movl LDA,LDAX # backup LDA | |||||
| movl N,J | |||||
| movl J,NN # backup N | |||||
| movl A,J | |||||
| movl J,AA # backup A | |||||
| movl M,J | |||||
| movl J,MMM # mov M to MMM | |||||
| .L0t: | |||||
| xorl J,J | |||||
| addl $1,J | |||||
| sall $21,J # J=2^21; 2^21*sizeof(double)=buffer size(16MB) | |||||
| subl $4, J # Don't use the last 4 doubles in the buffer. | |||||
| # Now, split M by block J | |||||
| subl J,MMM # MMM=MMM-J | |||||
| movl J,M | |||||
| jge .L00t | |||||
| ALIGN_4 | |||||
| movl MMM,%eax | |||||
| addl J,%eax | |||||
| jle .L999x | |||||
| movl %eax,M | |||||
| .L00t: | |||||
| movl AA,%eax | |||||
| movl %eax,A # mov AA to A | |||||
| movl NN,%eax | |||||
| movl %eax,N # reset N | |||||
| movl LDAX, LDA # reset LDA | |||||
| movl STACK_X, X | movl STACK_X, X | ||||
| movl STACK_INCX, INCX | movl STACK_INCX, INCX | ||||
| movl STACK_INCY, INCY | movl STACK_INCY, INCY | ||||
| @@ -117,6 +158,7 @@ | |||||
| leal (,INCY, SIZE), INCY | leal (,INCY, SIZE), INCY | ||||
| leal (,LDA, SIZE), LDA | leal (,LDA, SIZE), LDA | ||||
| subl $-16 * SIZE, A | subl $-16 * SIZE, A | ||||
| cmpl $0, N | cmpl $0, N | ||||
| @@ -560,10 +602,19 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L999: | .L999: | ||||
| movl M,J | |||||
| leal (,J,SIZE),%eax | |||||
| addl %eax,AA | |||||
| jmp .L0t | |||||
| ALIGN_4 | |||||
| .L999x: | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| popl %ebp | popl %ebp | ||||
| addl $ARGS,%esp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -269,7 +269,7 @@ | |||||
| sarl $5, I | sarl $5, I | ||||
| jle .L113 | jle .L113 | ||||
| #if defined(BARCELONA) | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| movaps %xmm0, %xmm1 | movaps %xmm0, %xmm1 | ||||
| mulps -32 * SIZE(X), %xmm1 | mulps -32 * SIZE(X), %xmm1 | ||||
| @@ -76,7 +76,8 @@ | |||||
| xorps %xmm1, %xmm1 | xorps %xmm1, %xmm1 | ||||
| comisd %xmm0, %xmm1 | comisd %xmm0, %xmm1 | ||||
| jne .L100 # Alpha != ZERO | jne .L100 # Alpha != ZERO | ||||
| jp .L100 # For Alpha = NaN | |||||
| /* Alpha == ZERO */ | /* Alpha == ZERO */ | ||||
| cmpl $SIZE, INCX | cmpl $SIZE, INCX | ||||
| jne .L50 | jne .L50 | ||||
| @@ -252,7 +253,7 @@ | |||||
| sarl $4, I | sarl $4, I | ||||
| jle .L113 | jle .L113 | ||||
| #if defined(BARCELONA) | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| movaps %xmm0, %xmm1 | movaps %xmm0, %xmm1 | ||||
| mulpd -16 * SIZE(X), %xmm1 | mulpd -16 * SIZE(X), %xmm1 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -488,7 +488,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1697,7 +1697,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1727,7 +1727,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -437,7 +437,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -833,7 +833,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1848,7 +1848,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2109,7 +2109,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2429,7 +2429,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -2459,7 +2459,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2952,7 +2952,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -3148,7 +3148,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3389,7 +3389,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -3404,7 +3404,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -910,7 +910,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -959,7 +959,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1439,7 +1439,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1469,7 +1469,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -872,7 +872,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1316,7 +1316,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1855,7 +1855,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1885,7 +1885,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2249,7 +2249,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2562,7 +2562,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2957,7 +2957,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -2972,7 +2972,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -3280,7 +3280,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3515,7 +3515,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -1036,7 +1036,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1066,7 +1066,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -2224,7 +2224,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -2273,7 +2273,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -62,7 +62,7 @@ | |||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 21 + 4) | #define PREFETCHSIZE (8 * 21 + 4) | ||||
| #endif | #endif | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -454,7 +454,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -758,7 +758,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -993,7 +993,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -1324,7 +1324,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1354,7 +1354,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1718,7 +1718,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2031,7 +2031,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2859,7 +2859,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -3303,7 +3303,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1541,6 +1541,16 @@ | |||||
| popl %ebx | popl %ebx | ||||
| popl %esi | popl %esi | ||||
| popl %edi | popl %edi | ||||
| /*remove the hidden return value address from the stack.*/ | |||||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||||
| #ifdef MS_ABI | |||||
| /* For MinGW GCC >= 4.7, which is compatible with the MSVC ABI: return without removing the hidden return value address. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */ | |||||
| ret | |||||
| #else | |||||
| /* Remove the hidden return value address from the stack. For MinGW GCC < 4.7. */ | |||||
| ret $0x4 | ret $0x4 | ||||
| #endif | |||||
| #else | |||||
| /* Remove the hidden return value address from the stack on non-Windows targets (e.g. Linux). */ | |||||
| ret $0x4 | |||||
| #endif | |||||
| EPILOGUE | EPILOGUE | ||||
| @@ -74,7 +74,7 @@ | |||||
| #define BB %ecx | #define BB %ecx | ||||
| #define LDC %ebp | #define LDC %ebp | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| @@ -625,7 +625,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 4 * SIZE(BB), %xmm2 | movsd 4 * SIZE(BB), %xmm2 | ||||
| @@ -870,7 +870,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1173,7 +1173,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1203,7 +1203,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1359,7 +1359,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -1536,7 +1536,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -1794,7 +1794,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -1809,7 +1809,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -1936,7 +1936,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -2069,7 +2069,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (8 * 1 - 4) | #define PREFETCHSIZE (8 * 1 - 4) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCHSIZE (16 * 1 + 8) | #define PREFETCHSIZE (16 * 1 + 8) | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHB prefetcht0 | #define PREFETCHB prefetcht0 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -71,7 +71,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #ifdef BARCELONA | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetchnta | #define PREFETCH prefetchnta | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 5) | #define PREFETCHSIZE (16 * 5) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #ifdef BARCELONA | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetchnta | #define PREFETCH prefetchnta | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (8 * 5) | #define PREFETCHSIZE (8 * 5) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define PREFETCHSIZE (16 * 2) | #define PREFETCHSIZE (16 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (16 * 7) | #define PREFETCHSIZE (16 * 7) | ||||
| @@ -71,7 +71,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #ifdef BARCELONA | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetchnta | #define PREFETCH prefetchnta | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 5) | #define PREFETCHSIZE (16 * 5) | ||||
| @@ -45,7 +45,7 @@ | |||||
| #define PREFETCHSIZE (8 * 2) | #define PREFETCHSIZE (8 * 2) | ||||
| #endif | #endif | ||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| #define PREFETCHSIZE (8 * 7) | #define PREFETCHSIZE (8 * 7) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #ifdef BARCELONA | |||||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||||
| #define PREFETCH prefetchnta | #define PREFETCH prefetchnta | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (8 * 5) | #define PREFETCHSIZE (8 * 5) | ||||
| @@ -55,7 +55,7 @@ | |||||
| #define XX %edi | #define XX %edi | ||||
| #define FLAG %ebp | #define FLAG %ebp | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -697,7 +697,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L120 | jne .L120 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| PSHUFD2($0, %xmm0, %xmm6) | PSHUFD2($0, %xmm0, %xmm6) | ||||
| PSHUFD2($0, %xmm1, %xmm1) | PSHUFD2($0, %xmm1, %xmm1) | ||||
| @@ -57,7 +57,7 @@ | |||||
| #include "l1param.h" | #include "l1param.h" | ||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||||
| #define USE_PSHUFD | #define USE_PSHUFD | ||||
| #else | #else | ||||
| #define USE_PSHUFD_HALF | #define USE_PSHUFD_HALF | ||||
| @@ -860,7 +860,7 @@ | |||||
| cmpl $2 * SIZE, INCX | cmpl $2 * SIZE, INCX | ||||
| jne .L220 | jne .L220 | ||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||||
| #ifdef HAVE_SSE3 | #ifdef HAVE_SSE3 | ||||
| movddup %xmm0, %xmm6 | movddup %xmm0, %xmm6 | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -533,7 +533,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -994,7 +994,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -63,7 +63,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| #ifdef NEHALEM | |||||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||||
| #define PREFETCH prefetcht1 | #define PREFETCH prefetcht1 | ||||
| #define PREFETCHSIZE 84 | #define PREFETCHSIZE 84 | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -1820,7 +1820,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -0,0 +1,62 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| @@ -0,0 +1,62 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| @@ -0,0 +1,84 @@ | |||||
| SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| #DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| #DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||||
| CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||||
| #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||||
| #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||||
| #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||||
| #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||||
| #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||||
| #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||||
| @@ -69,7 +69,7 @@ | |||||
| #endif | #endif | ||||
| movaps %xmm0, ALPHA | movaps %xmm0, ALPHA | ||||
| #else | #else | ||||
| movaps %xmm3, ALPHA | |||||
| movq 40(%rsp), X | movq 40(%rsp), X | ||||
| movq 48(%rsp), INCX | movq 48(%rsp), INCX | ||||
| @@ -79,6 +79,10 @@ | |||||
| SAVEREGISTERS | SAVEREGISTERS | ||||
| #ifdef WINDOWS_ABI | |||||
| movaps %xmm3, ALPHA | |||||
| #endif | |||||
| shufps $0, ALPHA, ALPHA | shufps $0, ALPHA, ALPHA | ||||
| leaq (, INCX, SIZE), INCX | leaq (, INCX, SIZE), INCX | ||||
| @@ -69,7 +69,6 @@ | |||||
| #endif | #endif | ||||
| movaps %xmm0, ALPHA | movaps %xmm0, ALPHA | ||||
| #else | #else | ||||
| movaps %xmm3, ALPHA | |||||
| movq 40(%rsp), X | movq 40(%rsp), X | ||||
| movq 48(%rsp), INCX | movq 48(%rsp), INCX | ||||
| @@ -79,6 +78,10 @@ | |||||
| SAVEREGISTERS | SAVEREGISTERS | ||||
| #ifdef WINDOWS_ABI | |||||
| movaps %xmm3, ALPHA | |||||
| #endif | |||||
| unpcklpd ALPHA, ALPHA | unpcklpd ALPHA, ALPHA | ||||
| leaq (, INCX, SIZE), INCX | leaq (, INCX, SIZE), INCX | ||||