| @@ -1,16 +1,25 @@ | |||
| *.obj | |||
| *.lib | |||
| *.dll | |||
| *.dylib | |||
| *.def | |||
| *.o | |||
| lapack-3.1.1 | |||
| lapack-3.1.1.tgz | |||
| lapack-3.4.1 | |||
| lapack-3.4.1.tgz | |||
| lapack-3.4.2 | |||
| lapack-3.4.2.tgz | |||
| *.so | |||
| *.a | |||
| .svn | |||
| *~ | |||
| lib.grd | |||
| nohup.out | |||
| config.h | |||
| Makefile.conf | |||
| Makefile.conf_last | |||
| config_last.h | |||
| getarch | |||
| getarch_2nd | |||
| utest/openblas_utest | |||
| @@ -1,4 +1,115 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.6 | |||
| 2-Mar-2013 | |||
| common: | |||
| * Improved OpenMP performance slightly. (d744c9) | |||
| * Improved cblas.h compatibility with Intel MKL.(#185) | |||
| * Fixed the overflowing bug in single thread cholesky factorization. | |||
| * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) | |||
| x86/x86-64: | |||
| * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | |||
| We will tune the performance in future. | |||
| * Auto-detect Intel Xeon E7540. | |||
| * Fixed the overflowing buffer bug of gemv. (#173) | |||
| * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) | |||
| MIPS64: | |||
| ==================================================================== | |||
| Version 0.2.5 | |||
| 26-Nov-2012 | |||
| common: | |||
| * Added NO_SHARED flag to disable generating the shared library. | |||
| * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) | |||
| * Export LAPACK 3.4.2 symbols in shared library. (#147) | |||
| * Only detect the number of physical CPU cores on Mac OSX. (#157) | |||
| * Fixed NetBSD build. (#155) | |||
| * Fixed compilation with TARGET=GENERIC. (#160) | |||
| x86/x86-64: | |||
| * Restore the original CPU affinity when calling | |||
| openblas_set_num_threads(1) (#153) | |||
| * Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154) | |||
| MIPS64: | |||
| ==================================================================== | |||
| Version 0.2.4 | |||
| 8-Oct-2012 | |||
| common: | |||
| * Upgraded LAPACK to 3.4.2 version. (#145) | |||
| * Provided support for passing CFLAGS, FFLAGS, PFLAGS, | |||
| FPFLAGS to make. (#137) | |||
| * f77blas.h:compatibility for compilers without C99 complex | |||
| number support. (#141) | |||
| x86/x86-64: | |||
| * Added NO_AVX flag. Check OS supporting AVX on runtime. (#139) | |||
| * Fixed zdot incompatibility ABI issue with GCC 4.7 on | |||
| Windows 32-bit. (#140) | |||
| MIPS64: | |||
| * Fixed the generation of shared library bug. | |||
| * Fixed the detection bug on the Loongson 3A server. | |||
| ==================================================================== | |||
| Version 0.2.3 | |||
| 20-Aug-2012 | |||
| common: | |||
| * Fixed LAPACK unstable bug about ?laswp. (#130) | |||
| * Fixed the shared library bug about unloading the library on | |||
| Linux (#132). | |||
| * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) | |||
| Please use gcc and IBM xlf. (#134) | |||
| x86/x86-64: | |||
| * Supported goto_set_num_threads and openblas_set_num_threads | |||
| APIs in Windows. They can set the number of threads on runtime. | |||
| ==================================================================== | |||
| Version 0.2.2 | |||
| 6-July-2012 | |||
| common: | |||
| * Fixed exporting DLL functions bug on Windows/MingW | |||
| * Support GNU Hurd (Thank Sylvestre Ledru) | |||
| * Support kfreebsd kernel (Thank Sylvestre Ledru) | |||
| x86/x86-64: | |||
| * Support Intel Sandy Bridge 22nm desktop/mobile CPU | |||
| SPARC: | |||
| * Improve the detection of SPARC (Thank Sylvestre Ledru) | |||
| ==================================================================== | |||
| Version 0.2.1 | |||
| 30-Jun-2012 | |||
| common: | |||
| x86/x86-64: | |||
| * Fixed the SEGFAULT bug about hyper-threading | |||
| * Support AMD Bulldozer by using GotoBLAS2 AMD Barcelona codes | |||
| ==================================================================== | |||
| Version 0.2.0 | |||
| 26-Jun-2012 | |||
| common: | |||
| * Removed the limitation (64) of numbers of CPU cores. | |||
| Now, it supports 256 cores at max. | |||
| * Supported clang compiler. | |||
| * Fixed some build bugs on FreeBSD | |||
| x86/x86-64: | |||
| * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. | |||
| Please use gcc >= 4.6 or clang >=3.1. | |||
| * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. | |||
| ==================================================================== | |||
| Version 0.1.1 | |||
| 29-Apr-2012 | |||
| common: | |||
| * Upgraded LAPACK to 3.4.1 version. (Thank Zaheer Chothia) | |||
| * Supported LAPACKE, a C interface to LAPACK. (Thank Zaheer Chothia) | |||
| * Fixed the build bug (MD5 and download) on Mac OSX. | |||
| * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. | |||
| * Fixed the compatibility issue for compilers without C99 complex number | |||
| (e.g. Visual Studio) | |||
| x86/x86_64: | |||
| * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. | |||
| * Test alpha=Nan in dscale. | |||
| * Fixed a SEGFAULT bug in samax on x86 windows. | |||
| ==================================================================== | |||
| Version 0.1.0 | |||
| 23-Mar-2012 | |||
| @@ -90,6 +90,15 @@ | |||
| number of threads will consume extra resource. I recommend you to | |||
| specify minimum number of threads. | |||
| 1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? | |||
| A This may be related to a bug in the Linux kernel 2.6.32. Try applying | |||
| the patch segfaults.patch using | |||
| patch < segfaults.patch | |||
| and see if the crashes persist. Note that this patch will lead to many | |||
| compiler warnings. | |||
| 2. Architecture Specific issue or Implementation | |||
| @@ -1,4 +1,4 @@ | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -3,7 +3,7 @@ include ./Makefile.system | |||
| BLASDIRS = interface driver/level2 driver/level3 driver/others | |||
| ifndef DYNAMIC_ARCH | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| BLASDIRS += kernel | |||
| endif | |||
| @@ -26,7 +26,7 @@ endif | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| .PHONY : all libs netlib test ctest shared install | |||
| .PHONY : all libs netlib test ctest shared install | |||
| .NOTPARALLEL : all libs prof lapack-test install | |||
| all :: libs netlib tests shared | |||
| @@ -80,6 +80,7 @@ endif | |||
| @echo | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifeq ($(OSNAME), Linux) | |||
| $(MAKE) -C exports so | |||
| -ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @@ -99,11 +100,10 @@ ifeq ($(OSNAME), Darwin) | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| $(MAKE) -C exports dll | |||
| -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| $(MAKE) -C exports dll | |||
| -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll | |||
| endif | |||
| endif | |||
| tests : | |||
| @@ -147,7 +147,7 @@ ifeq ($(EXPRECISION), 1) | |||
| echo "#define EXPRECISION">> config_last.h | |||
| endif | |||
| ## | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| $(MAKE) -C kernel commonlibs || exit 1 | |||
| for d in $(DYNAMIC_CORE) ; \ | |||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | |||
| @@ -165,7 +165,7 @@ prof_blas : | |||
| $(MAKE) -C $$d prof || exit 1 ; \ | |||
| fi; \ | |||
| done | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| $(MAKE) -C kernel commonprof || exit 1 | |||
| endif | |||
| @@ -184,7 +184,7 @@ hpl : | |||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | |||
| fi; \ | |||
| done | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| $(MAKE) -C kernel commonlibs || exit 1 | |||
| for d in $(DYNAMIC_CORE) ; \ | |||
| do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | |||
| @@ -203,47 +203,73 @@ ifeq ($(NO_LAPACK), 1) | |||
| netlib : | |||
| else | |||
| netlib : lapack-3.4.0 patch.for_lapack-3.4.0 lapack-3.4.0/make.inc | |||
| netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifndef NOFORTRAN | |||
| -@$(MAKE) -C lapack-3.4.0 lapacklib | |||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||
| endif | |||
| endif | |||
| prof_lapack : lapack-3.4.0 lapack-3.4.0/make.inc | |||
| -@$(MAKE) -C lapack-3.4.0 lapack_prof | |||
| prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| lapack-3.4.0/make.inc : | |||
| $(NETLIB_LAPACK_DIR)/make.inc : | |||
| ifndef NOFORTRAN | |||
| -@echo "FORTRAN = $(FC)" > lapack-3.4.0/make.inc | |||
| -@echo "OPTS = $(FFLAGS)" >> lapack-3.4.0/make.inc | |||
| -@echo "POPTS = $(FPFLAGS)" >> lapack-3.4.0/make.inc | |||
| -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.4.0/make.inc | |||
| -@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.4.0/make.inc | |||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.4.0/make.inc | |||
| -@echo "ARCH = $(AR)" >> lapack-3.4.0/make.inc | |||
| -@echo "RANLIB = $(RANLIB)" >> lapack-3.4.0/make.inc | |||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.4.0/make.inc | |||
| -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.4.0/make.inc | |||
| -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.4.0/make.inc | |||
| -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.4.0/make.inc | |||
| # -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.4.0/make.inc | |||
| -@cat make.inc >> lapack-3.4.0/make.inc | |||
| endif | |||
| lapack-3.4.0 : lapack-3.4.0.tgz | |||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifdef INTERFACE64 | |||
| -@echo "override CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "override CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| lapack-3.4.2 : lapack-3.4.2.tgz | |||
| ifndef NOFORTRAN | |||
| @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ | |||
| ifndef NO_LAPACK | |||
| @if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \ | |||
| echo $(TAR) zxf $< ;\ | |||
| $(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ | |||
| $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ | |||
| rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ | |||
| else \ | |||
| echo " lapack-3.4.0.tgz check sum is wrong (Please use orignal)." ;\ | |||
| rm -rf lapack-3.4.0 ;\ | |||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||
| echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \ | |||
| exit 1; \ | |||
| fi | |||
| endif | |||
| endif | |||
| LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz | |||
| lapack-3.4.0.tgz : | |||
| lapack-3.4.2.tgz : | |||
| ifndef NOFORTRAN | |||
| -wget http://www.netlib.org/lapack/lapack-3.4.0.tgz | |||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) | |||
| curl -O $(LAPACK_URL) | |||
| else | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| fetch $(LAPACK_URL) | |||
| else | |||
| wget -O $@ $(LAPACK_URL) | |||
| endif | |||
| endif | |||
| endif | |||
| large.tgz : | |||
| @@ -256,21 +282,21 @@ ifndef NOFORTRAN | |||
| -wget http://www.netlib.org/lapack/timing/timing.tgz | |||
| endif | |||
| lapack-timing : lapack-3.4.0 large.tgz timing.tgz | |||
| lapack-timing : lapack-3.4.2 large.tgz timing.tgz | |||
| ifndef NOFORTRAN | |||
| (cd lapack-3.4.0; $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd lapack-3.4.0/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| make -C lapack-3.4.0 tmglib | |||
| make -C lapack-3.4.0/TIMING | |||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| make -C $(NETLIB_LAPACK_DIR) tmglib | |||
| make -C $(NETLIB_LAPACK_DIR)/TIMING | |||
| endif | |||
| lapack-test : | |||
| $(MAKE) -C lapack-3.4.0 tmglib | |||
| $(MAKE) -C lapack-3.4.0/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc | |||
| @rm -f lapack-3.4.0/TESTING/*.out | |||
| $(MAKE) -j 1 -C lapack-3.4.0/TESTING | |||
| $(GREP) failed lapack-3.4.0/TESTING/*.out | |||
| $(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc | |||
| @rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING | |||
| $(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out | |||
| dummy : | |||
| @@ -288,10 +314,10 @@ clean :: | |||
| #endif | |||
| @$(MAKE) -C reference clean | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | |||
| @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @if test -d lapack-3.4.0; then \ | |||
| echo deleting lapack-3.4.0; \ | |||
| rm -rf lapack-3.4.0 ;\ | |||
| @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @if test -d $(NETLIB_LAPACK_DIR); then \ | |||
| echo deleting $(NETLIB_LAPACK_DIR); \ | |||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||
| fi | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @echo Done. | |||
| @echo Done. | |||
| @@ -23,7 +23,7 @@ install : lib.grd | |||
| @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
| @echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR) | |||
| @echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @@ -32,8 +32,18 @@ install : lib.grd | |||
| @cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h | |||
| ifndef NO_CBLAS | |||
| @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) | |||
| @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h | |||
| endif | |||
| # Install the LAPACKE C headers into the include directory unless NO_LAPACKE | |||
| # is defined. lapacke_mangling_with_flags.h is installed under the name | |||
| # lapacke_mangling.h. The leading '-' makes make ignore a failed copy. | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(OPENBLAS_INCLUDE_DIR) | |||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| endif | |||
| #for install static library | |||
| @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) | |||
| @@ -61,11 +71,9 @@ ifeq ($(OSNAME), Darwin) | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) | |||
| -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll | |||
| endif | |||
| @echo Install OK! | |||
| @@ -1,3 +1,5 @@ | |||
| # This is triggered by Makefile.system and runs before any of the code is built. | |||
| export BINARY | |||
| export USE_OPENMP | |||
| @@ -15,7 +17,7 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| all: getarch_2nd | |||
| all: getarch_2nd cblas_noconst.h | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| @@ -36,4 +38,7 @@ else | |||
| $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c | |||
| endif | |||
| cblas_noconst.h : cblas.h | |||
| perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h | |||
| dummy: | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.1.0 | |||
| VERSION = 0.2.6 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -24,10 +24,13 @@ VERSION = 0.1.0 | |||
| # Fortran compiler. Default is g77. | |||
| # FC = gfortran | |||
| # Even you can specify cross compiler | |||
| # You can even specify a cross compiler. In that case, please also set HOSTCC. | |||
| # CC = x86_64-w64-mingw32-gcc | |||
| # FC = x86_64-w64-mingw32-gfortran | |||
| # If you use the cross compiler, please set this host compiler. | |||
| # HOSTCC = gcc | |||
| # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | |||
| # BINARY=64 | |||
| @@ -45,12 +48,19 @@ VERSION = 0.1.0 | |||
| # automatically detected by the script. | |||
| # NUM_THREADS = 24 | |||
| # If you don't need to generate the shared library, please comment it in. | |||
| # NO_SHARED = 1 | |||
| # If you don't need CBLAS interface, please comment it in. | |||
| # NO_CBLAS = 1 | |||
| # If you don't need LAPACK, please comment it in. | |||
| # If you don't need LAPACK, please comment it in. | |||
| # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||
| # NO_LAPACK = 1 | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||
| # NO_LAPACKE = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -67,6 +77,10 @@ VERSION = 0.1.0 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| # NO_AFFINITY = 1 | |||
| # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | |||
| # and OS. However, the performance is low. | |||
| # NO_AVX = 1 | |||
| # If you would like to know minute performance report of GotoBLAS. | |||
| # FUNCTION_PROFILE = 1 | |||
| @@ -90,8 +104,8 @@ VERSION = 0.1.0 | |||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | |||
| # with single thread. You can use this flag to avoid the overhead of multi-threading | |||
| # in small matrix sizes. The default value is 4. | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # in small matrix sizes. The default value is 50. | |||
| # GEMM_MULTITHREAD_THRESHOLD = 50 | |||
| # If you need sanity check by comparing reference BLAS. It'll be very | |||
| # slow (Not implemented yet). | |||
| @@ -104,19 +118,16 @@ VERSION = 0.1.0 | |||
| # The installation directory. | |||
| # PREFIX = /opt/OpenBLAS | |||
| # Common Optimization Flag; -O2 is enough. | |||
| # DEBUG = 1 | |||
| ifeq ($(DEBUG), 1) | |||
| COMMON_OPT += -g | |||
| # -DDEBUG | |||
| else | |||
| COMMON_OPT += -O2 | |||
| endif | |||
| # Common Optimization Flag; | |||
| # The default -O2 is enough. | |||
| # COMMON_OPT = -O2 | |||
| # Profiling flags | |||
| COMMON_PROF = -pg | |||
| # Build Debug version | |||
| # DEBUG = 1 | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -9,8 +9,20 @@ ifndef TOPDIR | |||
| TOPDIR = . | |||
| endif | |||
| ifndef NETLIB_LAPACK_DIR | |||
| NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2 | |||
| endif | |||
| # Default C compiler | |||
| # - Only set if not specified on the command line or inherited from the environment. | |||
| # - CC is an implicit variable so neither '?=' nor 'ifndef' can be used. | |||
| # http://stackoverflow.com/questions/4029274/mingw-and-make-variables | |||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | |||
| ifeq ($(origin CC),default) | |||
| CC = gcc | |||
| endif | |||
| # Default Fortran compiler (FC) is selected by f_check. | |||
| ifndef MAKEFILE_RULE | |||
| include $(TOPDIR)/Makefile.rule | |||
| @@ -41,16 +53,24 @@ GETARCH_FLAGS += -DUSE64BITINT | |||
| endif | |||
| ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| GEMM_MULTITHREAD_THRESHOLD=4 | |||
| GEMM_MULTITHREAD_THRESHOLD=50 | |||
| endif | |||
| GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) | |||
| ifeq ($(NO_AVX), 1) | |||
| GETARCH_FLAGS += -DNO_AVX | |||
| endif | |||
| ifeq ($(DEBUG), 1) | |||
| GETARCH_FLAGS += -g | |||
| endif | |||
| # This operation is expensive, so execution should be once. | |||
| ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| @@ -101,6 +121,15 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap | |||
| ifeq ($(OSNAME), Darwin) | |||
| export MACOSX_DEPLOYMENT_TARGET=10.2 | |||
| MD5SUM = md5 -r | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| MD5SUM = md5 -r | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| MD5SUM = md5 -n | |||
| endif | |||
| ifeq ($(OSNAME), Linux) | |||
| @@ -120,6 +149,26 @@ EXTRALIB += -defaultlib:advapi32 | |||
| SUFFIX = obj | |||
| PSUFFIX = pobj | |||
| LIBSUFFIX = lib | |||
| ifeq ($(C_COMPILER), GCC) | |||
| #Test for supporting MS_ABI | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGT4), 1) | |||
| # GCC major version > 4 | |||
| # It is compatible with MSVC ABI. | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ4), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||
| # GCC Version >=4.7 | |||
| # It is compatible with MSVC ABI. | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| @@ -223,14 +272,20 @@ endif | |||
| endif | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| ifndef DYNAMIC_CORE | |||
| @@ -459,11 +514,28 @@ ifdef INTERFACE64 | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -n32 | |||
| else | |||
| FCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| FCOMMON_OPT += -loongson3 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| FCOMMON_OPT += -loongson3 | |||
| endif | |||
| else | |||
| ifndef BINARY64 | |||
| FCOMMON_OPT += -m32 | |||
| else | |||
| FCOMMON_OPT += -m64 | |||
| endif | |||
| endif | |||
| ifdef USE_OPENMP | |||
| FEXTRALIB += -lstdc++ | |||
| @@ -472,12 +544,30 @@ endif | |||
| endif | |||
| ifeq ($(C_COMPILER), OPEN64) | |||
| ifeq ($(ARCH), mips64) | |||
| ifndef BINARY64 | |||
| CCOMMON_OPT += -n32 | |||
| else | |||
| CCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| CCOMMON_OPT += -loongson3 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| CCOMMON_OPT += -loongson3 | |||
| endif | |||
| else | |||
| ifndef BINARY64 | |||
| CCOMMON_OPT += -m32 | |||
| else | |||
| CCOMMON_OPT += -m64 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(C_COMPILER), SUN) | |||
| CCOMMON_OPT += -w | |||
| @@ -533,6 +623,16 @@ endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| CCOMMON_OPT += -DNO_LAPACK | |||
| #Disable LAPACK C interface | |||
| NO_LAPACKE = 1 | |||
| endif | |||
| ifeq ($(NO_LAPACKE), 1) | |||
| CCOMMON_OPT += -DNO_LAPACKE | |||
| endif | |||
| ifeq ($(NO_AVX), 1) | |||
| CCOMMON_OPT += -DNO_AVX | |||
| endif | |||
| ifdef SMP | |||
| @@ -651,17 +751,30 @@ PATCH = patch | |||
| GREP = grep | |||
| endif | |||
| ifndef MD5SUM | |||
| MD5SUM = md5sum | |||
| endif | |||
| AWK = awk | |||
| REVISION = -r$(VERSION) | |||
| MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) | |||
| CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| ifeq ($(DEBUG), 1) | |||
| COMMON_OPT += -g | |||
| endif | |||
| ifndef COMMON_OPT | |||
| COMMON_OPT = -O2 | |||
| endif | |||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) | |||
| FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| ifndef SUFFIX | |||
| SUFFIX = o | |||
| @@ -675,7 +788,7 @@ ifndef LIBSUFFIX | |||
| LIBSUFFIX = a | |||
| endif | |||
| ifndef DYNAMIC_ARCH | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| ifndef SMP | |||
| LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) | |||
| LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) | |||
| @@ -694,8 +807,8 @@ endif | |||
| endif | |||
| LIBDLLNAME = $(LIBPREFIX).dll | |||
| LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) | |||
| LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) | |||
| LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) | |||
| LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) | |||
| LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) | |||
| @@ -740,6 +853,7 @@ export HAVE_SSE4_1 | |||
| export HAVE_SSE4_2 | |||
| export HAVE_SSE4A | |||
| export HAVE_SSE5 | |||
| export HAVE_AVX | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| @@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) | |||
| endif | |||
| $(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX | |||
| $(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX | |||
| $(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX | |||
| $(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX | |||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX | |||
| $(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX | |||
| $(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX | |||
| $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX | |||
| $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX | |||
| $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX | |||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX | |||
| $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX | |||
| $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| libs :: $(BLASOBJS) $(COMMONOBJS) | |||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | |||
| @@ -1,83 +0,0 @@ | |||
| OpenBLAS Readme | |||
| 1.Introduction | |||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) | |||
| 2.Intallation | |||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||
| Or, | |||
| check out codes from git://github.com/xianyi/OpenBLAS.git | |||
| 1)Normal compile | |||
| (a) type "make" to detect the CPU automatically. | |||
| or | |||
| (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||
| 2)Cross compile | |||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||
| examples: | |||
| On X86 box, compile this library for loongson3a CPU. | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| 3)Debug version | |||
| make DEBUG=1 | |||
| 4)Intall to the directory (Optional) | |||
| e.g. | |||
| make install PREFIX=your_installation_directory | |||
| The default directory is /opt/OpenBLAS | |||
| 3.Support CPU & OS | |||
| Please read GotoBLAS_01Readme.txt | |||
| Additional support CPU: | |||
| x86_64: | |||
| Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. | |||
| MIPS64: | |||
| ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. | |||
| 4.Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| 4.1 Set the number of threads with environment variables. for example, | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| or | |||
| export GOTO_NUM_THREADS=4 | |||
| or | |||
| export OMP_NUM_THREADS=4 | |||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||
| 4.2 Set the number of threads with calling functions. for example, | |||
| void goto_set_num_threads(int num_threads); | |||
| or | |||
| void openblas_set_num_threads(int num_threads); | |||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||
| 5.Report Bugs | |||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | |||
| 6.To-Do List: | |||
| Optimization on ICT Loongson 3A CPU | |||
| 7.Contact | |||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||
| 8.ChangeLog | |||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||
| 9.Known Issues | |||
| * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | |||
| is 64. On 32 bits, it is 32. | |||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. | |||
| 10. Specification of Git Branches | |||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||
| Now, there are 4 branches in github.com. | |||
| * The master branch. This a main branch to reflect a production-ready state. | |||
| * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release. | |||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||
| * The gh-pages branch. This is for web pages | |||
| @@ -0,0 +1,117 @@ | |||
| # OpenBLAS | |||
| ## Introduction | |||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>. | |||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | |||
| ## Installation | |||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | |||
| ### Normal compile | |||
| * type "make" to detect the CPU automatically. | |||
| or | |||
| * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||
| ### Cross compile | |||
| Please set CC and FC to the cross toolchain compilers. Then, set HOSTCC to your host C compiler. Finally, set TARGET explicitly. | |||
| Examples: | |||
| On X86 box, compile this library for loongson3a CPU. | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ### Debug version | |||
| make DEBUG=1 | |||
| ### Install to the directory (Optional) | |||
| Example: | |||
| make install PREFIX=your_installation_directory | |||
| The default directory is /opt/OpenBLAS | |||
| ## Support CPU & OS | |||
| Please read GotoBLAS_01Readme.txt | |||
| ### Additional support CPU: | |||
| #### x86/x86-64: | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) | |||
| #### MIPS64: | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2. | |||
| - **ICT Loongson 3B**: Experimental | |||
| ### Support OS: | |||
| - **GNU/Linux** | |||
| - **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X. | |||
| - **FreeBSD**: Supported by the community. We didn't test the library on this OS. | |||
| ## Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| ### Set the number of threads with environment variables. | |||
| Examples: | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| or | |||
| export GOTO_NUM_THREADS=4 | |||
| or | |||
| export OMP_NUM_THREADS=4 | |||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||
| ### Set the number of threads on runtime. | |||
| We provide the functions below to control the number of threads at runtime. | |||
| void goto_set_num_threads(int num_threads); | |||
| void openblas_set_num_threads(int num_threads); | |||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||
| ## Report Bugs | |||
| Please open an issue at https://github.com/xianyi/OpenBLAS/issues | |||
| ## Contact | |||
| OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | |||
| ## ChangeLog | |||
| Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | |||
| ## Troubleshooting | |||
| * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. | |||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code. | |||
| * The number of CPUs/Cores should be less than or equal to 256. | |||
| * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. | |||
| * On Loongson 3A, make test fails because of a pthread_create error. The error code is EAGAIN. However, the same test case succeeds when you run it directly from the shell. | |||
| ## Specification of Git Branches | |||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||
| Now, there are 4 branches in github.com. | |||
| * The master branch. This is the main branch that reflects a production-ready state. | |||
| * The develop branch. This is the main branch that reflects the latest delivered development changes for the next release. | |||
| * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature into the develop branch in the future. | |||
| * The gh-pages branch. This is for web pages. | |||
| @@ -8,8 +8,8 @@ Supported List: | |||
| 1.X86/X86_64 | |||
| a)Intel CPU: | |||
| P2 | |||
| COPPERMINE | |||
| KATMAI | |||
| COPPERMINE | |||
| NORTHWOOD | |||
| PRESCOTT | |||
| BANIAS | |||
| @@ -18,6 +18,7 @@ CORE2 | |||
| PENRYN | |||
| DUNNINGTON | |||
| NEHALEM | |||
| SANDYBRIDGE | |||
| ATOM | |||
| b)AMD CPU: | |||
| @@ -27,6 +28,8 @@ OPTERON_SSE3 | |||
| BARCELONA | |||
| SHANGHAI | |||
| ISTANBUL | |||
| BOBCAT | |||
| BULLDOZER | |||
| c)VIA CPU: | |||
| SSE_GENERIC | |||
| @@ -47,6 +50,7 @@ CELL | |||
| 3.MIPS64 CPU: | |||
| SICORTEX | |||
| LOONGSON3A | |||
| LOONGSON3B | |||
| 4.IA64 CPU: | |||
| ITANIUM2 | |||
| @@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/); | |||
| $compiler = GCC if ($compiler eq ""); | |||
| $os = Linux if ($data =~ /OS_LINUX/); | |||
| $os = FreeBSD if ($data =~ /OS_FreeBSD/); | |||
| $os = NetBSD if ($data =~ /OS_NetBSD/); | |||
| $os = Darwin if ($data =~ /OS_Darwin/); | |||
| $os = SunOS if ($data =~ /OS_SunOS/); | |||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||
| $os = AIX if ($data =~ /OS_AIX/); | |||
| $os = osf if ($data =~ /OS_OSF/); | |||
| $os = WINNT if ($data =~ /OS_WINNT/); | |||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); | |||
| $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); | |||
| $os = Interix if ($data =~ /OS_INTERIX/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| @@ -174,6 +174,8 @@ $linker_a = ""; | |||
| $link =~ s/\-Y\sP\,/\-Y/g; | |||
| @flags = split(/[\s\,\n]/, $link); | |||
| # remove leading and trailing quotes from each flag. | |||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||
| foreach $flags (@flags) { | |||
| if ( | |||
| @@ -1,287 +1,293 @@ | |||
| #ifndef CBLAS_H | |||
| #define CBLAS_H | |||
| #include <stddef.h> | |||
| #include "common.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| /* Assume C declarations for C++ */ | |||
| #endif /* __cplusplus */ | |||
| #include <stddef.h> | |||
| #include "common.h" | |||
| /* Set the number of threads at runtime. */ | |||
| void openblas_set_num_threads(int num_threads); | |||
| void goto_set_num_threads(int num_threads); | |||
| /* Get the build configuration at runtime. */ | |||
| char* openblas_get_config(void); | |||
| #define CBLAS_INDEX size_t | |||
| enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | |||
| enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; | |||
| enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; | |||
| enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; | |||
| enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; | |||
| float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||
| double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); | |||
| void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); | |||
| void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); | |||
| void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); | |||
| float cblas_sasum (blasint n, float *x, blasint incx); | |||
| double cblas_dasum (blasint n, double *x, blasint incx); | |||
| float cblas_scasum(blasint n, float *x, blasint incx); | |||
| double cblas_dzasum(blasint n, double *x, blasint incx); | |||
| float cblas_snrm2 (blasint N, float *X, blasint incX); | |||
| double cblas_dnrm2 (blasint N, double *X, blasint incX); | |||
| float cblas_scnrm2(blasint N, float *X, blasint incX); | |||
| double cblas_dznrm2(blasint N, double *X, blasint incX); | |||
| CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); | |||
| CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); | |||
| CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); | |||
| CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); | |||
| void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); | |||
| void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); | |||
| typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | |||
| typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; | |||
| typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | |||
| typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | |||
| typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | |||
| float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||
| void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||
| void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||
| void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||
| float cblas_sasum (const blasint n, const float *x, const blasint incx); | |||
| double cblas_dasum (const blasint n, const double *x, const blasint incx); | |||
| float cblas_scasum(const blasint n, const float *x, const blasint incx); | |||
| double cblas_dzasum(const blasint n, const double *x, const blasint incx); | |||
| float cblas_snrm2 (const blasint N, const float *X, const blasint incX); | |||
| double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); | |||
| float cblas_scnrm2(const blasint N, const float *X, const blasint incX); | |||
| double cblas_dznrm2(const blasint N, const double *X, const blasint incX); | |||
| CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); | |||
| CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); | |||
| CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); | |||
| CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); | |||
| void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); | |||
| void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); | |||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||
| void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); | |||
| void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); | |||
| void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); | |||
| void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); | |||
| void cblas_sscal(blasint N, float alpha, float *X, blasint incX); | |||
| void cblas_dscal(blasint N, double alpha, double *X, blasint incX); | |||
| void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); | |||
| void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); | |||
| void cblas_csscal(blasint N, float alpha, float *X, blasint incX); | |||
| void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); | |||
| void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); | |||
| void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); | |||
| void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); | |||
| void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); | |||
| void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||
| void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||
| void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||
| void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||
| void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, | |||
| blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, | |||
| blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, | |||
| float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, | |||
| double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, | |||
| float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, | |||
| double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); | |||
| void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); | |||
| void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); | |||
| void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); | |||
| void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); | |||
| void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); | |||
| void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); | |||
| void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); | |||
| void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||
| float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||
| double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||
| void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||
| void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); | |||
| void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); | |||
| void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||
| void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||
| void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); | |||
| void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); | |||
| void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); | |||
| void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); | |||
| void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); | |||
| void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); | |||
| void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); | |||
| void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); | |||
| void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); | |||
| void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); | |||
| void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); | |||
| void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); | |||
| void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); | |||
| void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); | |||
| void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||
| void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||
| void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||
| void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||
| void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, | |||
| const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, | |||
| const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, | |||
| const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, | |||
| const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, | |||
| const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, | |||
| const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); | |||
| void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); | |||
| void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); | |||
| void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); | |||
| void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); | |||
| void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); | |||
| void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); | |||
| void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); | |||
| void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||
| const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||
| const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | |||
| void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); /* alpha and beta are both passed by value here; C is the only non-const pointer. NOTE(review): presumably both scalars are real-valued for this rank-k update -- confirm. */ | |||
| void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); /* double-precision counterpart of cblas_cherk: same parameter list with double in place of float */ | |||
| void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); /* unlike cblas_cherk, alpha is passed by pointer while beta stays by value; C is the only non-const pointer. NOTE(review): presumably alpha is complex and beta real -- confirm. */ | |||
| void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); /* double-precision counterpart of cblas_cher2k: alpha by pointer, beta by value */ | |||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); /* Variadic: accepts further arguments after p, rout and form. NOTE(review): rout and form look like a routine name and a printf-style format string -- confirm against the implementation. */ | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| #endif | |||
| @@ -68,7 +68,7 @@ extern "C" { | |||
| #define SMP | |||
| #endif | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||
| #define WINDOWS_ABI | |||
| #define OS_WINDOWS | |||
| @@ -89,7 +89,7 @@ extern "C" { | |||
| #include <sched.h> | |||
| #endif | |||
| #ifdef OS_DARWIN | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) | |||
| #include <sched.h> | |||
| #endif | |||
| @@ -351,7 +351,12 @@ typedef int blasint; | |||
| #endif | |||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) /* protection bits: readable and writable */ | |||
| #ifdef __NetBSD__ /* NetBSD gets the MAP_ANON spelling of the flag. NOTE(review): presumably because MAP_ANONYMOUS is not provided there -- confirm. */ | |||
| #define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) | |||
| #else /* every other platform keeps MAP_ANONYMOUS */ | |||
| #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) | |||
| #endif /* __NetBSD__ */ | |||
| #include "param.h" | |||
| #include "common_param.h" | |||
| @@ -374,6 +379,31 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifndef NOINCLUDE | |||
| /* A standard header has to be included before the __STDC_* predefined | |||
| macros are tested below: with some compilers (e.g. GCC 4.7 on Linux) those | |||
| macros only become defined as a side effect of including either | |||
| <features.h> or <stdc-predef.h>. Defining NOINCLUDE skips this include. */ | |||
| #include <stdio.h> | |||
| #endif // NOINCLUDE | |||
| /* Select the representation of the openblas_complex_* types. | |||
| C99 supports complex floating-point numbers natively, and GCC has offered | |||
| them as an extension since version 3.0; in that case OPENBLAS_COMPLEX_C99 | |||
| is defined and the types are _Complex typedefs. If neither is available, | |||
| OPENBLAS_COMPLEX_STRUCT is defined instead and a compatible structure with | |||
| real and imag members is used as fallback (see Clause 6.2.5.13 of the C99 | |||
| standard). The GCC-version test is not applied when compiling as C++. | |||
| NOTE(review): xdouble is not defined in this block, so it must already be | |||
| in scope at this point -- confirm. */ | |||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| typedef float _Complex openblas_complex_float; | |||
| typedef double _Complex openblas_complex_double; | |||
| typedef xdouble _Complex openblas_complex_xdouble; | |||
| #else | |||
| #define OPENBLAS_COMPLEX_STRUCT | |||
| typedef struct { float real, imag; } openblas_complex_float; | |||
| typedef struct { double real, imag; } openblas_complex_double; | |||
| typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||
| #endif /* end of OPENBLAS_COMPLEX_C99 / OPENBLAS_COMPLEX_STRUCT selection */ | |||
| #endif // ASSEMBLER | |||
| #ifndef IFLUSH | |||
| #define IFLUSH | |||
| #endif | |||
| @@ -528,7 +558,8 @@ typedef struct { | |||
| #include "common_level3.h" | |||
| #include "common_lapack.h" | |||
| #ifdef CBLAS | |||
| #include "cblas.h" | |||
| /* This header file is generated from "cblas.h" (see Makefile.prebuild). */ | |||
| #include "cblas_noconst.h" | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| @@ -45,6 +45,8 @@ extern "C" { | |||
| int BLASFUNC(xerbla)(char *, blasint *info, blasint); | |||
| void openblas_set_num_threads_(int *); | |||
| FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | |||
| FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | |||
| @@ -74,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, | |||
| myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| #elif defined RETURN_BY_STACK | |||
| void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); | |||
| void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); | |||
| void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); | |||
| void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); | |||
| void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *); | |||
| void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *); | |||
| void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); | |||
| void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); | |||
| void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| #else | |||
| float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); | |||
| float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); | |||
| double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); | |||
| double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); | |||
| xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); | |||
| openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); | |||
| openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); | |||
| openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); | |||
| openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| #endif | |||
| void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); | |||
| @@ -640,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, | |||
| int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | |||
| xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | |||
| /* Lapack routines */ | |||
| int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); | |||
| int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); | |||
| int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); | |||
| @@ -675,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float | |||
| int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); | |||
| int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); | |||
| int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| @@ -689,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| @@ -86,7 +86,13 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned | |||
| return syscall(SYS_set_mempolicy, mode, addr, flag); | |||
| } | |||
| static inline int my_gettid(void) { return syscall(SYS_gettid); } | |||
| /* Returns the result of the gettid system call when the headers define SYS_gettid; otherwise returns getpid(). */ | |||
| static inline int my_gettid(void) { | |||
| #ifdef SYS_gettid | |||
| return syscall(SYS_gettid); | |||
| #else | |||
| /* No SYS_gettid syscall number is defined here, so the process id is used instead. */ | |||
| return getpid(); | |||
| #endif | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -63,5 +63,7 @@ double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double | |||
| void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); | |||
| double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); | |||
| FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); | |||
| #endif | |||
| @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { | |||
| int openmp_nthreads=0; | |||
| #endif | |||
| if ((blas_cpu_number == 1) | |||
| if (blas_cpu_number == 1 | |||
| #ifdef USE_OPENMP | |||
| || omp_in_parallel() | |||
| @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #define PROFCODE | |||
| #endif | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||
| #define SAVEREGISTERS \ | |||
| subl $32, %esp;\ | |||
| movups %xmm6, 0(%esp);\ | |||
| @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #define RESTOREREGISTERS | |||
| #endif | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 16; \ | |||
| @@ -282,7 +282,7 @@ REALNAME: | |||
| #define EPILOGUE .end REALNAME | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 16; \ | |||
| @@ -356,4 +356,11 @@ REALNAME: | |||
| #ifndef ALIGN_6 | |||
| #define ALIGN_6 .align 64 | |||
| // ffreep %st(0). | |||
| // Because Clang didn't support ffreep, we directly use the opcode. | |||
| // Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||
| #ifndef ffreep | |||
| #define ffreep .byte 0xdf, 0xc0 # | |||
| #endif | |||
| #endif | |||
| @@ -353,7 +353,7 @@ REALNAME: | |||
| #define EPILOGUE .end REALNAME | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 512; \ | |||
| @@ -425,6 +425,7 @@ REALNAME: | |||
| #define ALIGN_2 .align 2 | |||
| #define ALIGN_3 .align 3 | |||
| #define ALIGN_4 .align 4 | |||
| #define ALIGN_5 .align 5 | |||
| #define ffreep fstp | |||
| #endif | |||
| @@ -448,4 +449,10 @@ REALNAME: | |||
| #define ALIGN_6 .align 64 | |||
| #endif | |||
| // ffreep %st(0). | |||
| // Because Clang didn't support ffreep, we directly use the opcode. | |||
| // Please check out http://www.sandpile.org/x86/opc_fpu.htm | |||
| #ifndef ffreep | |||
| #define ffreep .byte 0xdf, 0xc0 # | |||
| #endif | |||
| #endif | |||
| @@ -103,6 +103,9 @@ | |||
| #define CORE_NEHALEM 17 | |||
| #define CORE_ATOM 18 | |||
| #define CORE_NANO 19 | |||
| #define CORE_SANDYBRIDGE 20 | |||
| #define CORE_BOBCAT 21 | |||
| #define CORE_BULLDOZER 22 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -122,6 +125,8 @@ | |||
| #define HAVE_MISALIGNSSE (1 << 15) | |||
| #define HAVE_128BITFPU (1 << 16) | |||
| #define HAVE_FASTMOVU (1 << 17) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -188,4 +193,7 @@ typedef struct { | |||
| #define CPUTYPE_NSGEODE 41 | |||
| #define CPUTYPE_VIAC3 42 | |||
| #define CPUTYPE_NANO 43 | |||
| #define CPUTYPE_SANDYBRIDGE 44 | |||
| #define CPUTYPE_BOBCAT 45 | |||
| #define CPUTYPE_BULLDOZER 46 | |||
| #endif | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -101,12 +101,14 @@ int detect(void){ | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("system type", buffer, 11)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| @@ -119,6 +121,24 @@ int detect(void){ | |||
| }else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| //The scan is skipped when /proc/cpuinfo cannot be opened, so fgets/fclose never receive a NULL stream. | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| if (infile != NULL){ | |||
|   while (fgets(buffer, sizeof(buffer), infile)){ | |||
|     if (!strncmp("model name", buffer, 10)){ | |||
|       //The value starts two characters after ':'; p stays NULL if the line has no ':' or ends right after it. | |||
|       p = strchr(buffer, ':'); | |||
|       if (p != NULL && p[1] != '\0') p += 2; else p = (char *)NULL; | |||
|       break; | |||
|     } | |||
|   } | |||
|   fclose(infile); | |||
| } | |||
| //Only the 3A and 3B model names are recognised here; anything else falls through to the code below. | |||
| if(p != NULL){ | |||
|   if (strstr(p, "Loongson-3A")){ | |||
|     return CPU_LOONGSON3A; | |||
|   }else if(strstr(p, "Loongson-3B")){ | |||
|     return CPU_LOONGSON3B; | |||
|   } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -40,6 +40,13 @@ | |||
| #include <string.h> | |||
| #include "cpuid.h" | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
| #define CORE_BULLDOZER CORE_BARCELONA | |||
| #endif | |||
| #ifndef CPUIDEMU | |||
| #if defined(__APPLE__) && defined(__i386__) | |||
| @@ -109,6 +116,33 @@ static inline int have_excpuid(void){ | |||
| return eax & 0xffff; | |||
| } | |||
| #ifndef NO_AVX | |||
| /* Runs the xgetbv instruction with op loaded in ECX and stores the EAX and EDX results through the eax and edx pointers. */ | |||
| static inline void xgetbv(int op, int * eax, int * edx){ | |||
| //Use binary code for xgetbv | |||
| /* The instruction is emitted as raw bytes rather than by mnemonic; presumably so that assemblers without xgetbv support still build this file - confirm. */ | |||
| __asm__ __volatile__ | |||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| } | |||
| #endif | |||
| /* Reports whether AVX may be used. Returns 1 only when CPUID leaf 1 has ECX bits 27 and 28 both set | |||
|    and the low word read by xgetbv(0) has bits 1 and 2 both set; returns 0 otherwise. | |||
|    Always returns 0 when built with NO_AVX. */ | |||
| int support_avx(){ | |||
| #ifdef NO_AVX | |||
|   return 0; | |||
| #else | |||
|   int eax, ebx, ecx, edx; | |||
|   const int cpu_bits = (1 << 28) | (1 << 27); | |||
|   cpuid(1, &eax, &ebx, &ecx, &edx); | |||
|   /* Both CPUID bits are required before xgetbv is executed at all. */ | |||
|   if ((ecx & cpu_bits) != cpu_bits) return 0; | |||
|   xgetbv(0, &eax, &edx); | |||
|   /* Bits 1 and 2 set together is what the code treats as "OS supports AVX". */ | |||
|   return ((eax & 6) == 6) ? 1 : 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| char vendor[13]; | |||
| @@ -189,11 +223,17 @@ int get_cputype(int gettype){ | |||
| if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; | |||
| if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; | |||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | |||
| #ifndef NO_AVX | |||
| if (support_avx()) feature |= HAVE_AVX; | |||
| #endif | |||
| if (have_excpuid() >= 0x01) { | |||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
| if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | |||
| if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | |||
| #ifndef NO_AVX | |||
| if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||
| #endif | |||
| if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | |||
| if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | |||
| } | |||
| @@ -974,21 +1014,44 @@ int get_cpuname(void){ | |||
| return CPUTYPE_DUNNINGTON; | |||
| } | |||
| break; | |||
| case 2: | |||
| switch (model) { | |||
| case 5: | |||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||
| // Xeon (Clarkdale), 32nm | |||
| return CPUTYPE_NEHALEM; | |||
| case 10: | |||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| //Xeon Processor 5600 (Westmere-EP) | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 2: | |||
| switch (model) { | |||
| case 5: | |||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||
| // Xeon (Clarkdale), 32nm | |||
| return CPUTYPE_NEHALEM; | |||
| case 10: | |||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; //OS doesn't support AVX | |||
| case 12: | |||
| //Xeon Processor 5600 (Westmere-EP) | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: | |||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: | |||
| // Xeon E7540 | |||
| case 15: | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 3: | |||
| switch (model) { | |||
| case 10: | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| break; | |||
| case 0x7: | |||
| @@ -1021,6 +1084,13 @@ int get_cpuname(void){ | |||
| case 1: | |||
| case 10: | |||
| return CPUTYPE_BARCELONA; | |||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CPUTYPE_BULLDOZER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 5: | |||
| return CPUTYPE_BOBCAT; | |||
| } | |||
| break; | |||
| } | |||
| @@ -1140,6 +1210,9 @@ static char *cpuname[] = { | |||
| "NSGEODE", | |||
| "VIAC3", | |||
| "NANO", | |||
| "SANDYBRIDGE", | |||
| "BOBCAT", | |||
| "BULLDOZER", | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1186,6 +1259,9 @@ static char *lowercpuname[] = { | |||
| "tms3x00", | |||
| "nsgeode", | |||
| "nano", | |||
| "sandybridge", | |||
| "bobcat", | |||
| "bulldozer", | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1209,6 +1285,9 @@ static char *corename[] = { | |||
| "NEHALEM", | |||
| "ATOM", | |||
| "NANO", | |||
| "SANDYBRIDGE", | |||
| "BOBCAT", | |||
| "BULLDOZER", | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1232,6 +1311,9 @@ static char *corename_lower[] = { | |||
| "nehalem", | |||
| "atom", | |||
| "nano", | |||
| "sandybridge", | |||
| "bobcat", | |||
| "bulldozer", | |||
| }; | |||
| @@ -1315,10 +1397,33 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| case 10: | |||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||
| return CORE_NEHALEM; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; //OS doesn't support AVX | |||
| case 12: | |||
| //Xeon Processor 5600 (Westmere-EP) | |||
| return CORE_NEHALEM; | |||
| case 13: | |||
| //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; //OS doesn't support AVX | |||
| case 14: | |||
| //Xeon E7540 | |||
| case 15: | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 3: | |||
| switch (model) { | |||
| case 10: | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; //OS doesn't support AVX | |||
| } | |||
| break; | |||
| } | |||
| @@ -1334,7 +1439,15 @@ int get_coretype(void){ | |||
| if (family <= 0x5) return CORE_80486; | |||
| if (family <= 0xe) return CORE_ATHLON; | |||
| if (family == 0xf){ | |||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
| else if (exfamily == 5) return CORE_BOBCAT; | |||
| else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CORE_BULLDOZER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||
| }else return CORE_BARCELONA; | |||
| } | |||
| } | |||
| @@ -1400,6 +1513,9 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_SIZE %d\n", info.size * 1024); | |||
| printf("#define DTB_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); | |||
| } else { | |||
| //fall back for some virtual machines. | |||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||
| } | |||
| features = get_cputype(GET_FEATURE); | |||
| @@ -1414,8 +1530,10 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); | |||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | |||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | |||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | |||
| if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | |||
| if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | |||
| @@ -1479,7 +1597,9 @@ void get_sse(void){ | |||
| if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); | |||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | |||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | |||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| } | |||
| @@ -34,20 +34,20 @@ COMPILER_GNU | |||
| OS_LINUX | |||
| #endif | |||
| #if defined(__FreeBSD__) | |||
| OS_FreeBSD | |||
| #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) | |||
| OS_FREEBSD | |||
| #endif | |||
| #if defined(__NetBSD__) | |||
| OS_NetBSD | |||
| OS_NETBSD | |||
| #endif | |||
| #if defined(__sun) | |||
| OS_SunOS | |||
| OS_SUNOS | |||
| #endif | |||
| #if defined(__APPLE__) | |||
| OS_Darwin | |||
| OS_DARWIN | |||
| #endif | |||
| #if defined(_AIX) | |||
| @@ -63,13 +63,18 @@ OS_WINNT | |||
| #endif | |||
| #if defined(__CYGWIN__) | |||
| OS_CYGWIN | |||
| OS_CYGWIN_NT | |||
| #endif | |||
| #if defined(__INTERIX) | |||
| OS_INTERIX | |||
| #endif | |||
| #if defined(__gnu_hurd__) | |||
| /* Hurd is very similar to GNU/Linux, it should work out of the box */ | |||
| OS_LINUX | |||
| #endif | |||
| #if defined(__i386) || defined(_X86) | |||
| ARCH_X86 | |||
| #endif | |||
| @@ -5,7 +5,7 @@ | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| CFLAGS += -DADD$(BU) -DCBLAS | |||
| override CFLAGS += -DADD$(BU) -DCBLAS | |||
| LIB = $(TOPDIR)/$(LIBNAME) | |||
| @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| a = (FLOAT *)args -> a; | |||
| x = (FLOAT *)args -> b; | |||
| y = (FLOAT *)args -> c; | |||
| lda = args -> lda; | |||
| incx = args -> ldb; | |||
| @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| n_from = 0; | |||
| n_to = n; | |||
| //Use y as each thread's n* COMPSIZE elements in sb buffer | |||
| y = buffer; | |||
| buffer += ((COMPSIZE * n + 1023) & ~1023); | |||
| if (range_m) { | |||
| n_from = *(range_m + 0); | |||
| n_to = *(range_m + 1); | |||
| @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| a += n_from * lda * COMPSIZE; | |||
| } | |||
| if (range_n) y += *range_n * COMPSIZE; | |||
| if (incx != 1) { | |||
| COPY_K(n, x, incx, buffer, 1); | |||
| @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| if (num_cpu) { | |||
| queue[0].sa = NULL; | |||
| queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; | |||
| queue[0].sb = buffer; | |||
| queue[num_cpu - 1].next = NULL; | |||
| exec_blas(num_cpu, queue); | |||
| @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| #else | |||
| ONE, ZERO, | |||
| #endif | |||
| buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); | |||
| (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); | |||
| } | |||
| AXPYU_K(n, 0, 0, | |||
| @@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| queue[num_cpu].args = arg; | |||
| queue[num_cpu].range_m = range_m; | |||
| queue[num_cpu].range_n = &range[num_cpu]; | |||
| #if defined(LOONGSON3A) | |||
| #if 0 //defined(LOONGSON3A) | |||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | |||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | |||
| #else | |||
| @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| } | |||
| if (num_cpu) { | |||
| #if defined(LOONGSON3A) | |||
| #if 0 //defined(LOONGSON3A) | |||
| queue[0].sa = sa; | |||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | |||
| #else | |||
| @@ -1,12 +1,12 @@ | |||
| TOPDIR = ../.. | |||
| include ../../Makefile.system | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) | |||
| COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | |||
| ifdef SMP | |||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) | |||
| ifndef NO_AFFINITY | |||
| COMMONOBJS += init.$(SUFFIX) | |||
| endif | |||
| @@ -14,7 +14,7 @@ endif | |||
| # COMMONOBJS += info.$(SUFFIX) | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| @@ -70,7 +70,7 @@ ifndef BLAS_SERVER | |||
| BLAS_SERVER = blas_server.c | |||
| endif | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. | |||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -215,7 +218,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| hpl : CFLAGS += -DHPL | |||
| hpl_p : CFLAGS += -DHPL | |||
| hpl : override CFLAGS += -DHPL | |||
| hpl_p : override CFLAGS += -DHPL | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| @@ -435,7 +436,7 @@ static int blas_thread_server(void *arg){ | |||
| blas_memory_free(buffer); | |||
| pthread_exit(NULL); | |||
| //pthread_exit(NULL); | |||
| return 0; | |||
| } | |||
| @@ -770,6 +771,19 @@ void goto_set_num_threads(int num_threads) { | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| #ifndef NO_AFFINITY | |||
| if (num_threads == 1) { | |||
| if (blas_cpu_number == 1){ | |||
| //OpenBLAS is already single thread. | |||
| return; | |||
| }else{ | |||
| //From multi-threads to single thread | |||
| //Restore the original affinity mask | |||
| gotoblas_set_affinity(-1); | |||
| } | |||
| } | |||
| #endif | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| if (num_threads > blas_num_threads) { | |||
| @@ -800,6 +814,13 @@ void goto_set_num_threads(int num_threads) { | |||
| UNLOCK_COMMAND(&server_lock); | |||
| } | |||
| #ifndef NO_AFFINITY | |||
| if(blas_cpu_number == 1 && num_threads > 1){ | |||
| //Restore the thread 0 affinity. | |||
| gotoblas_set_affinity(0); | |||
| } | |||
| #endif | |||
| blas_cpu_number = num_threads; | |||
| #if defined(ARCH_MIPS64) | |||
| @@ -49,8 +49,12 @@ | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||
| void goto_set_num_threads(int num_threads) { | |||
| int i=0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| omp_set_num_threads(blas_cpu_number); | |||
| //adjust buffer for each thread | |||
| for(i=0; i<blas_cpu_number; i++){ | |||
| if(blas_thread_buffer[i]==NULL){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| } | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| } | |||
| } | |||
| #if defined(ARCH_MIPS64) | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| @@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) { | |||
| /* Initialises the thread layer: queries the CPU count, marks the server as available, | |||
|    allocates one buffer for each of the first blas_num_threads slots and clears the | |||
|    remaining slots up to MAX_CPU_NUMBER. Always returns 0. */ | |||
| int blas_thread_init(void){ | |||
|   int slot; | |||
|   blas_get_cpu_number(); | |||
|   blas_server_avail = 1; | |||
|   slot = 0; | |||
|   /* Slots in use get a buffer from blas_memory_alloc(2). */ | |||
|   while (slot < blas_num_threads){ | |||
|     blas_thread_buffer[slot] = blas_memory_alloc(2); | |||
|     slot++; | |||
|   } | |||
|   /* Slots past blas_num_threads are marked empty. */ | |||
|   while (slot < MAX_CPU_NUMBER){ | |||
|     blas_thread_buffer[slot] = NULL; | |||
|     slot++; | |||
|   } | |||
|   return 0; | |||
| } | |||
| /* Marks the server as unavailable and frees every per-thread buffer that is still allocated, | |||
|    resetting each slot to NULL. Always returns 0. */ | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
|   int slot; | |||
|   blas_server_avail = 0; | |||
|   for (slot = 0; slot < MAX_CPU_NUMBER; slot++){ | |||
|     /* Empty slots have nothing to free. */ | |||
|     if (blas_thread_buffer[slot] == NULL) continue; | |||
|     blas_memory_free(blas_thread_buffer[slot]); | |||
|     blas_thread_buffer[slot] = NULL; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| static void exec_threads(blas_queue_t *queue){ | |||
| void *buffer, *sa, *sb; | |||
| int pos=0, release_flag=0; | |||
| buffer = NULL; | |||
| sa = queue -> sa; | |||
| sb = queue -> sb; | |||
| @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ | |||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | |||
| buffer = blas_memory_alloc(2); | |||
| pos = omp_get_thread_num(); | |||
| buffer = blas_thread_buffer[pos]; | |||
| //fallback | |||
| if(buffer==NULL) { | |||
| buffer = blas_memory_alloc(2); | |||
| release_flag=1; | |||
| } | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| @@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| } | |||
| @@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| } | |||
| if (buffer != NULL) blas_memory_free(buffer); | |||
| if (release_flag) blas_memory_free(buffer); | |||
| } | |||
| @@ -63,6 +63,8 @@ static blas_pool_t pool; | |||
| static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| if (!(mode & BLAS_COMPLEX)){ | |||
| @@ -179,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| do { | |||
| action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); | |||
| } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); | |||
| } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); | |||
| if (action == WAIT_OBJECT_0 + 1) break; | |||
| @@ -251,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| @@ -263,7 +266,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| } else { | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } | |||
| } | |||
| }else{ | |||
| continue; //if queue == NULL | |||
| } | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | |||
| @@ -425,7 +430,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| /* Shutdown procedure. The user does not have to call this routine; the */ | |||
| /* kernel automatically kills the threads. */ | |||
| int blas_thread_shutdown_(void){ | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i; | |||
| @@ -437,7 +442,7 @@ int blas_thread_shutdown_(void){ | |||
| SetEvent(pool.killed); | |||
| for(i = 0; i < blas_cpu_number - 1; i++){ | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| WaitForSingleObject(blas_threads[i], INFINITE); | |||
| } | |||
| @@ -448,3 +453,47 @@ int blas_thread_shutdown_(void){ | |||
| return 0; | |||
| } | |||
| /* Set the number of threads used by OpenBLAS (Windows thread server). | |||
|  *  - num_threads < 1 selects the current blas_cpu_number. | |||
|  *  - num_threads > MAX_CPU_NUMBER is clamped to MAX_CPU_NUMBER. | |||
|  * The pool only grows here: when more threads are requested than | |||
|  * blas_num_threads records, the missing workers are created under | |||
|  * server_lock. A smaller request only lowers blas_cpu_number. */ | |||
| void goto_set_num_threads(int num_threads) | |||
| { | |||
|   long i; | |||
|   if (num_threads < 1) num_threads = blas_cpu_number; | |||
|   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
|   if (num_threads > blas_num_threads) { | |||
|     LOCK_COMMAND(&server_lock); | |||
|     //increased_threads = 1; | |||
|     if (!blas_server_avail){ | |||
|       /* First use: set up the shared pool state before any worker starts. */ | |||
|       InitializeCriticalSection(&pool.lock); | |||
|       pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); /* auto-reset event */ | |||
|       pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);  /* manual-reset event */ | |||
|       pool.shutdown = 0; | |||
|       pool.queue = NULL; | |||
|       blas_server_avail = 1; | |||
|     } | |||
|     /* Workers occupy slots 0 .. num_threads-2. When no pool exists yet | |||
|        (blas_num_threads == 0) start at slot 0; starting at | |||
|        blas_num_threads - 1 would index blas_threads[-1]. */ | |||
|     for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
|       blas_threads[i] = CreateThread(NULL, 0, | |||
|                                      blas_thread_server, (void *)i, | |||
|                                      0, &blas_threads_id[i]); | |||
|     } | |||
|     blas_num_threads = num_threads; | |||
|     UNLOCK_COMMAND(&server_lock); | |||
|   } | |||
|   blas_cpu_number = num_threads; | |||
| } | |||
| /* OpenBLAS-named entry point; forwards num unchanged to goto_set_num_threads(). */ | |||
| void openblas_set_num_threads(int num) | |||
| { | |||
| goto_set_num_threads(num); | |||
| } | |||
| @@ -60,6 +60,16 @@ extern gotoblas_t gotoblas_NEHALEM; | |||
| extern gotoblas_t gotoblas_OPTERON; | |||
| extern gotoblas_t gotoblas_OPTERON_SSE3; | |||
| extern gotoblas_t gotoblas_BARCELONA; | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #ifndef NO_AVX | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #endif | |||
| #define VENDOR_INTEL 1 | |||
| #define VENDOR_AMD 2 | |||
| @@ -68,6 +78,32 @@ extern gotoblas_t gotoblas_BARCELONA; | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| #ifndef NO_AVX | |||
| /* Execute XGETBV with ECX = op and store the resulting EAX and EDX | |||
|    values through the eax / edx pointers. */ | |||
| static inline void xgetbv(int op, int * eax, int * edx){ | |||
| //Use binary code for xgetbv | |||
| // NOTE(review): presumably emitted as raw bytes so that assemblers without the xgetbv mnemonic still build - confirm. | |||
| __asm__ __volatile__ | |||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| } | |||
| #endif | |||
| /* Report whether AVX kernels may be used. | |||
|    Returns 1 only when CPUID leaf 1 sets both ECX bit 28 (AVX) and ECX bit 27 | |||
|    (OSXSAVE) and XGETBV(0) shows bits 1 and 2 set (XMM and YMM state enabled | |||
|    by the OS). Returns 0 otherwise, and always 0 when built with NO_AVX. */ | |||
| int support_avx(){ | |||
| #ifndef NO_AVX | |||
|   int eax, ebx, ecx, edx; | |||
|   const int cpu_bits = (1 << 28) | (1 << 27); | |||
|   const int os_bits = 6; | |||
|   cpuid(1, &eax, &ebx, &ecx, &edx); | |||
|   /* XGETBV must not be executed unless the CPU flags are present. */ | |||
|   if ((ecx & cpu_bits) != cpu_bits) return 0; | |||
|   xgetbv(0, &eax, &edx); | |||
|   if ((eax & os_bits) != os_bits) return 0; | |||
|   return 1; //OS support AVX | |||
| #else | |||
|   return 0; | |||
| #endif | |||
| } | |||
| static int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| char vendor[13]; | |||
| @@ -122,15 +158,39 @@ static gotoblas_t *get_coretype(void){ | |||
| if (model == 12) return &gotoblas_ATOM; | |||
| return NULL; | |||
| case 2: | |||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||
| // Xeon (Clarkdale), 32nm | |||
| if (model == 5) return &gotoblas_NEHALEM; | |||
| case 2: | |||
| //Intel Core (Clarkdale) / Core (Arrandale) | |||
| // Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||
| // Xeon (Clarkdale), 32nm | |||
| if (model == 5) return &gotoblas_NEHALEM; | |||
| //Intel Xeon Processor 5600 (Westmere-EP) | |||
| if (model == 12) return &gotoblas_NEHALEM; | |||
| return NULL; | |||
| //Intel Xeon Processor 5600 (Westmere-EP) | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| //Xeon E7540 | |||
| if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; | |||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||
| //Intel Core i7-3000 / Xeon E5 | |||
| if (model == 10 || model == 13) { | |||
| if(support_avx()) | |||
| return &gotoblas_SANDYBRIDGE; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| case 3: | |||
| //Intel Sandy Bridge 22nm (Ivy Bridge?) | |||
| if (model == 10) { | |||
| if(support_avx()) | |||
| return &gotoblas_SANDYBRIDGE; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| if (model <= 0x2) return &gotoblas_NORTHWOOD; | |||
| @@ -144,7 +204,17 @@ static gotoblas_t *get_coretype(void){ | |||
| if ((exfamily == 0) || (exfamily == 2)) { | |||
| if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | |||
| else return &gotoblas_OPTERON; | |||
| } else { | |||
| } else if (exfamily == 5) { | |||
| return &gotoblas_BOBCAT; | |||
| } else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return &gotoblas_BULLDOZER; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| } | |||
| @@ -178,6 +248,9 @@ static char *corename[] = { | |||
| "Opteron(SSE3)", | |||
| "Barcelona", | |||
| "Nano", | |||
| "Sandybridge", | |||
| "Bobcat", | |||
| "Bulldozer", | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -197,7 +270,10 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_OPTERON) return corename[13]; | |||
| if (gotoblas == &gotoblas_BARCELONA) return corename[14]; | |||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| return corename[0]; | |||
| } | |||
| @@ -211,12 +287,21 @@ void gotoblas_dynamic_init(void) { | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | |||
| #else | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | |||
| /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ | |||
| if (sizeof(void*) == 8) { | |||
| if (gotoblas == &gotoblas_KATMAI || | |||
| gotoblas == &gotoblas_COPPERMINE || | |||
| gotoblas == &gotoblas_NORTHWOOD || | |||
| gotoblas == &gotoblas_BANIAS || | |||
| gotoblas == &gotoblas_ATHLON) | |||
| gotoblas = &gotoblas_PRESCOTT; | |||
| } | |||
| #endif | |||
| if (gotoblas && gotoblas -> init) { | |||
| gotoblas -> init(); | |||
| } else { | |||
| fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define MAX_NODES 16 | |||
| #define MAX_CPUS 256 | |||
| #define NCPUBITS (8*sizeof(unsigned long)) | |||
| #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) | |||
| #define CPUELT(cpu) ((cpu) / NCPUBITS) | |||
| #define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) | |||
| #define SH_MAGIC 0x510510 | |||
| @@ -103,10 +108,10 @@ typedef struct { | |||
| int num_nodes; | |||
| int num_procs; | |||
| int final_num_procs; | |||
| unsigned long avail; | |||
| unsigned long avail [MAX_BITMASK_LEN]; | |||
| int avail_count; | |||
| unsigned long cpu_info [MAX_CPUS]; | |||
| unsigned long node_info [MAX_NODES]; | |||
| unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; | |||
| int cpu_use[MAX_CPUS]; | |||
| } shm_t; | |||
| @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; | |||
| static int shmid, pshmid; | |||
| static void *paddr; | |||
| static unsigned long lprocmask, lnodemask; | |||
| static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; | |||
| static int lprocmask_count = 0; | |||
| static int numprocs = 1; | |||
| static int numnodes = 1; | |||
| @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { | |||
| than sizeof(unsigned long). On 64 bits, the limit | |||
| is 64. On 32 bits, it is 32. | |||
| ***/ | |||
| static inline unsigned long get_cpumap(int node) { | |||
| static inline void get_cpumap(int node, unsigned long * node_info) { | |||
| int infile; | |||
| unsigned long affinity; | |||
| unsigned long affinity[32]; | |||
| char name[160]; | |||
| char cpumap[160]; | |||
| char *p, *dummy; | |||
| char *dummy; | |||
| int i=0; | |||
| int count=0; | |||
| int k=0; | |||
| sprintf(name, CPUMAP_NAME, node); | |||
| infile = open(name, O_RDONLY); | |||
| for(i=0; i<32; i++){ | |||
| affinity[i] = 0; | |||
| } | |||
| affinity = 0; | |||
| if (infile != -1) { | |||
| read(infile, cpumap, sizeof(cpumap)); | |||
| p = cpumap; | |||
| while (*p != '\n' && i<160){ | |||
| if(*p != ',') { | |||
| name[i++]=*p; | |||
| } | |||
| p++; | |||
| } | |||
| p = name; | |||
| // while ((*p == '0') || (*p == ',')) p++; | |||
| for(i=0; i<160; i++){ | |||
| if(cpumap[i] == '\n') | |||
| break; | |||
| if(cpumap[i] != ','){ | |||
| name[k++]=cpumap[i]; | |||
| //Enough data for Hex | |||
| if(k >= NCPUBITS/4){ | |||
| affinity[count++] = strtoul(name, &dummy, 16); | |||
| k=0; | |||
| } | |||
| } | |||
| affinity = strtoul(p, &dummy, 16); | |||
| } | |||
| if(k!=0){ | |||
| name[k]='\0'; | |||
| affinity[count++] = strtoul(name, &dummy, 16); | |||
| k=0; | |||
| } | |||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||
| // revert the sequence | |||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||
| node_info[i]=affinity[count-i-1]; | |||
| } | |||
| close(infile); | |||
| } | |||
| return affinity; | |||
| return ; | |||
| } | |||
| static inline unsigned long get_share(int cpu, int level) { | |||
| static inline void get_share(int cpu, int level, unsigned long * share) { | |||
| int infile; | |||
| unsigned long affinity; | |||
| unsigned long affinity[32]; | |||
| char cpumap[160]; | |||
| char name[160]; | |||
| char *p; | |||
| char *dummy; | |||
| int count=0; | |||
| int i=0,k=0; | |||
| int bitmask_idx = 0; | |||
| sprintf(name, SHARE_NAME, cpu, level); | |||
| infile = open(name, O_RDONLY); | |||
| affinity = (1UL << cpu); | |||
| // Init share | |||
| for(i=0; i<MAX_BITMASK_LEN; i++){ | |||
| share[i]=0; | |||
| } | |||
| bitmask_idx = CPUELT(cpu); | |||
| share[bitmask_idx] = CPUMASK(cpu); | |||
| if (infile != -1) { | |||
| read(infile, name, sizeof(name)); | |||
| p = name; | |||
| read(infile, cpumap, sizeof(cpumap)); | |||
| while ((*p == '0') || (*p == ',')) p++; | |||
| for(i=0; i<160; i++){ | |||
| if(cpumap[i] == '\n') | |||
| break; | |||
| if(cpumap[i] != ','){ | |||
| name[k++]=cpumap[i]; | |||
| //Enough data | |||
| if(k >= NCPUBITS/4){ | |||
| affinity[count++] = strtoul(name, &dummy, 16); | |||
| k=0; | |||
| } | |||
| } | |||
| affinity = strtol(p, &p, 16); | |||
| } | |||
| if(k!=0){ | |||
| name[k]='\0'; | |||
| affinity[count++] = strtoul(name, &dummy, 16); | |||
| k=0; | |||
| } | |||
| // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... | |||
| // revert the sequence | |||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||
| share[i]=affinity[count-i-1]; | |||
| } | |||
| close(infile); | |||
| } | |||
| return affinity; | |||
| return ; | |||
| } | |||
| static int numa_check(void) { | |||
| @@ -248,6 +298,7 @@ static int numa_check(void) { | |||
| DIR *dp; | |||
| struct dirent *dir; | |||
| int node; | |||
| int j; | |||
| common -> num_nodes = 0; | |||
| @@ -258,7 +309,9 @@ static int numa_check(void) { | |||
| return 0; | |||
| } | |||
| for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; | |||
| for (node = 0; node < MAX_NODES; node ++) { | |||
| for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; | |||
| } | |||
| while ((dir = readdir(dp)) != NULL) { | |||
| if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | |||
| @@ -266,12 +319,12 @@ static int numa_check(void) { | |||
| node = atoi(&dir -> d_name[4]); | |||
| if (node > MAX_NODES) { | |||
| fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||
| fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||
| exit(1); | |||
| } | |||
| common -> num_nodes ++; | |||
| common -> node_info[node] = get_cpumap(node); | |||
| get_cpumap(node, common->node_info[node]); | |||
| } | |||
| } | |||
| @@ -284,7 +337,7 @@ static int numa_check(void) { | |||
| fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | |||
| for (node = 0; node < common -> num_nodes; node ++) | |||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); | |||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); | |||
| #endif | |||
| return common -> num_nodes; | |||
| @@ -296,11 +349,13 @@ static void numa_mapping(void) { | |||
| int i, j, h; | |||
| unsigned long work, bit; | |||
| int count = 0; | |||
| int bitmask_idx = 0; | |||
| for (node = 0; node < common -> num_nodes; node ++) { | |||
| core = 0; | |||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | |||
| if (common -> node_info[node] & common -> avail & (1UL << cpu)) { | |||
| bitmask_idx = CPUELT(cpu); | |||
| if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { | |||
| common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | |||
| count ++; | |||
| core ++; | |||
| @@ -357,58 +412,92 @@ static void numa_mapping(void) { | |||
| static void disable_hyperthread(void) { | |||
| unsigned long share; | |||
| unsigned long share[MAX_BITMASK_LEN]; | |||
| int cpu; | |||
| int bitmask_idx = 0; | |||
| int i=0, count=0; | |||
| bitmask_idx = CPUELT(common -> num_procs); | |||
| if(common->num_procs > 64){ | |||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||
| exit(1); | |||
| }else if(common->num_procs == 64){ | |||
| common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||
| }else | |||
| common -> avail = (1UL << common -> num_procs) - 1; | |||
| for(i=0; i< bitmask_idx; i++){ | |||
| common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||
| } | |||
| if(CPUMASK(common -> num_procs) != 1){ | |||
| common -> avail[count++] = CPUMASK(common -> num_procs) - 1; | |||
| } | |||
| common -> avail_count = count; | |||
| /* if(common->num_procs > 64){ */ | |||
| /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ | |||
| /* exit(1); */ | |||
| /* }else if(common->num_procs == 64){ */ | |||
| /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ | |||
| /* }else */ | |||
| /* common -> avail = (1UL << common -> num_procs) - 1; */ | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | |||
| fprintf(stderr, "\nAvail CPUs : "); | |||
| for(i=0; i<count; i++) | |||
| fprintf(stderr, "%04lx ", common -> avail[i]); | |||
| fprintf(stderr, ".\n"); | |||
| #endif | |||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | |||
| share = (get_share(cpu, 1) & common -> avail); | |||
| if (popcount(share) > 1) { | |||
| get_share(cpu, 1, share); | |||
| //When the shared cpu are in different element of share & avail array, this may be a bug. | |||
| for (i = 0; i < count ; i++){ | |||
| share[i] &= common->avail[i]; | |||
| if (popcount(share[i]) > 1) { | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||
| cpu, share & ~(1UL << cpu)); | |||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||
| cpu, share[i] & ~(CPUMASK(cpu))); | |||
| #endif | |||
| common -> avail &= ~((share & ~(1UL << cpu))); | |||
| common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void disable_affinity(void) { | |||
| int i=0; | |||
| int bitmask_idx=0; | |||
| int count=0; | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); | |||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); | |||
| fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | |||
| #endif | |||
| if(common->final_num_procs > 64){ | |||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||
| exit(1); | |||
| }else if(common->final_num_procs == 64){ | |||
| lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||
| }else | |||
| lprocmask = (1UL << common -> final_num_procs) - 1; | |||
| /* if(common->final_num_procs > 64){ */ | |||
| /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ | |||
| /* exit(1); */ | |||
| /* }else if(common->final_num_procs == 64){ */ | |||
| /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ | |||
| /* }else */ | |||
| /* lprocmask = (1UL << common -> final_num_procs) - 1; */ | |||
| bitmask_idx = CPUELT(common -> final_num_procs); | |||
| for(i=0; i< bitmask_idx; i++){ | |||
| lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||
| } | |||
| if(CPUMASK(common -> final_num_procs) != 1){ | |||
| lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; | |||
| } | |||
| lprocmask_count = count; | |||
| #ifndef USE_OPENMP | |||
| lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | |||
| for(i=0; i< count; i++){ | |||
| lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; | |||
| } | |||
| #endif | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); | |||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); | |||
| #endif | |||
| } | |||
| @@ -498,7 +587,7 @@ static void create_pshmem(void) { | |||
| static void local_cpu_map(void) { | |||
| int cpu, id, mapping; | |||
| int bitmask_idx = 0; | |||
| cpu = 0; | |||
| mapping = 0; | |||
| @@ -508,8 +597,9 @@ static void local_cpu_map(void) { | |||
| if (id > 0) { | |||
| if (is_dead(id)) common -> cpu_use[cpu] = 0; | |||
| } | |||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { | |||
| bitmask_idx = CPUELT(cpu); | |||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { | |||
| common -> cpu_use[cpu] = pshmid; | |||
| cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | |||
| @@ -595,6 +685,7 @@ void gotoblas_affinity_init(void) { | |||
| #ifndef USE_OPENMP | |||
| cpu_set_t cpu_mask; | |||
| #endif | |||
| int i; | |||
| if (initialized) return; | |||
| @@ -646,6 +737,11 @@ void gotoblas_affinity_init(void) { | |||
| common -> num_procs = get_nprocs(); | |||
| if(common -> num_procs > MAX_CPUS) { | |||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||
| exit(1); | |||
| } | |||
| for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | |||
| numa_check(); | |||
| @@ -654,7 +750,8 @@ void gotoblas_affinity_init(void) { | |||
| if (common -> num_nodes > 1) numa_mapping(); | |||
| common -> final_num_procs = popcount(common -> avail); | |||
| common -> final_num_procs = 0; | |||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); | |||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | |||
| @@ -664,7 +761,8 @@ void gotoblas_affinity_init(void) { | |||
| disable_affinity(); | |||
| num_avail = popcount(lprocmask); | |||
| num_avail = 0; | |||
| for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); | |||
| if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/syscall.h> | |||
| #endif | |||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| #include <sys/sysctl.h> | |||
| #endif | |||
| @@ -185,7 +185,7 @@ int get_num_procs(void) { | |||
| #endif | |||
| #if defined(OS_FreeBSD) || defined(OS_Darwin) | |||
| #if defined(OS_FREEBSD) | |||
| int get_num_procs(void) { | |||
| @@ -206,7 +206,27 @@ int get_num_procs(void) { | |||
| #endif | |||
| #if defined(OS_DARWIN) | |||
| /* Return the value of the "hw.physicalcpu" sysctl. | |||
|    The result is cached in a function-local static once it is non-zero, | |||
|    so the sysctl is normally queried only on the first call. */ | |||
| int get_num_procs(void) { | |||
|   static int nums = 0; | |||
|   size_t len; | |||
|   if (nums != 0) return nums; | |||
|   len = sizeof(nums); | |||
|   sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); | |||
|   return nums; | |||
| } | |||
| #endif | |||
| /* | |||
| The number of CPU cores OpenBLAS uses for multithreading. | |||
| It can be changed with openblas_set_num_threads(int num_threads). | |||
| */ | |||
| int blas_cpu_number = 0; | |||
| /* | |||
| The number of threads in the thread pool. | |||
| This value is equal to or larger than blas_cpu_number; when it is larger, the surplus threads sleep. | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int goto_get_num_procs (void) { | |||
| @@ -215,7 +235,7 @@ int goto_get_num_procs (void) { | |||
| int blas_get_cpu_number(void){ | |||
| char *p; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -223,7 +243,7 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| @@ -250,7 +270,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -1128,7 +1148,7 @@ static BLASULONG init_lock = 0UL; | |||
| static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | |||
| void *sa, void *sb, BLASLONG pos) { | |||
| #ifndef ARCH_POWER | |||
| #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) | |||
| long size; | |||
| BLASULONG buffer; | |||
| @@ -1289,6 +1309,7 @@ void DESTRUCTOR gotoblas_quit(void) { | |||
| moncontrol (1); | |||
| #endif | |||
| blas_shutdown(); | |||
| } | |||
| #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | |||
| @@ -0,0 +1,59 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| /* Build-configuration summary string. | |||
|    It starts empty and, through adjacent string-literal concatenation, gains | |||
|    one "NAME " token (with a trailing space) for each of the listed macros | |||
|    that is defined when this file is compiled. */ | |||
| static char* openblas_config_str="" | |||
| #ifdef USE64BITINT | |||
| "USE64BITINT " | |||
| #endif | |||
| #ifdef NO_CBLAS | |||
| "NO_CBLAS " | |||
| #endif | |||
| #ifdef NO_LAPACK | |||
| "NO_LAPACK " | |||
| #endif | |||
| #ifdef NO_LAPACKE | |||
| "NO_LAPACKE " | |||
| #endif | |||
| #ifdef DYNAMIC_ARCH | |||
| "DYNAMIC_ARCH " | |||
| #endif | |||
| #ifdef NO_AFFINITY | |||
| "NO_AFFINITY " | |||
| #endif | |||
| ; | |||
| /* Return the build-configuration string defined above. | |||
|    NOTE(review): CNAME is presumably a macro supplied by the build that | |||
|    expands to the exported function name - confirm. */ | |||
| char* CNAME() { | |||
| return openblas_config_str; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -33,13 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #ifdef SMP_SERVER | |||
| #ifdef OS_LINUX | |||
| extern void openblas_set_num_threads(int num_threads) ; | |||
| void NAME(int* num_threads){ | |||
| void openblas_set_num_threads_(int* num_threads){ | |||
| openblas_set_num_threads(*num_threads); | |||
| } | |||
| #endif | |||
| #else | |||
| //Single thread | |||
| /* Stub used when SMP_SERVER is not defined: the request is ignored. */ | |||
| void openblas_set_num_threads(int num_threads) { | |||
| } | |||
| /* Pointer-argument variant of the stub above (trailing-underscore name); also a no-op. */ | |||
| void openblas_set_num_threads_(int* num_threads){ | |||
| } | |||
| #endif | |||
| @@ -163,9 +163,9 @@ int get_L2_size(void){ | |||
| int eax, ebx, ecx, edx; | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -384,6 +384,17 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| #if defined(SANDYBRIDGE) | |||
| sgemm_p = 1024; | |||
| dgemm_p = 512; | |||
| cgemm_p = 512; | |||
| zgemm_p = 256; | |||
| #ifdef EXPRECISION | |||
| qgemm_p = 256; | |||
| xgemm_p = 128; | |||
| #endif | |||
| #endif | |||
| #if defined(CORE_PRESCOTT) || defined(GENERIC) | |||
| size >>= 6; | |||
| @@ -435,7 +446,7 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| #if defined(CORE_BARCELONA) | |||
| #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) | |||
| size >>= 8; | |||
| sgemm_p = 232 * size; | |||
| @@ -10,10 +10,23 @@ ifndef NO_CBLAS | |||
| NO_CBLAS = 0 | |||
| endif | |||
| ifndef NO_LAPACK | |||
| NO_LAPACK = 0 | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| NO_LAPACKE = 0 | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| EXTRALIB += -lgomp | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| @@ -58,15 +71,20 @@ dll : ../$(LIBDLLNAME) | |||
| dll2 : libgoto2_shared.dll | |||
| # On Windows, we only generate a DLL without a version suffix. This is because | |||
| # applications which link against the dynamic library reference a fixed DLL name | |||
| # in their import table. By instead using a stable name it is possible to | |||
| # upgrade between library versions, without needing to re-link an application. | |||
| # For more details see: https://github.com/xianyi/OpenBLAS/issues/127. | |||
| ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) | |||
| $(RANLIB) ../$(LIBNAME) | |||
| ifeq ($(BINARY32), 1) | |||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | |||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||
| --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||
| -lib /machine:i386 /def:libopenblas.def | |||
| else | |||
| $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ | |||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | |||
| --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) | |||
| -lib /machine:X64 /def:libopenblas.def | |||
| endif | |||
| @@ -76,13 +94,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def | |||
| -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) | |||
| libopenblas.def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| libgoto2_shared.def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| libgoto_hpl.def : gensymbol | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| $(LIBDYNNAME) : ../$(LIBNAME) osx.def | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| @@ -106,14 +124,15 @@ so : ../$(LIBSONAME) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) | |||
| so : ../$(LIBSONAME) | |||
| ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | |||
| $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | |||
| -Wl,--retain-symbols-file=linux.def $(EXTRALIB) | |||
| -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| rm -f linktest | |||
| @@ -163,23 +182,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| linux.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) | |||
| symbol.S : gensymbol | |||
| perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S | |||
| perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* | |||
| @@ -32,11 +32,12 @@ if ($compiler eq "") { | |||
| "pgf95", "pgf90", "pgf77", | |||
| "ifort"); | |||
| OUTER: | |||
| foreach $lists (@lists) { | |||
| foreach $path (@path) { | |||
| if (-f $path . "/" . $lists) { | |||
| if (-x $path . "/" . $lists) { | |||
| $compiler = $lists; | |||
| break; | |||
| last OUTER; | |||
| } | |||
| } | |||
| } | |||
| @@ -210,6 +211,10 @@ if (!$?) { | |||
| if ($?) { | |||
| $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For gfortran MIPS | |||
| if ($?) { | |||
| $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
| @@ -218,6 +223,10 @@ if (!$?) { | |||
| if ($?) { | |||
| $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For gfortran MIPS | |||
| if ($?) { | |||
| $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
| @@ -237,6 +246,8 @@ if ($link ne "") { | |||
| $link =~ s/\-rpath\s+/\-rpath\@/g; | |||
| @flags = split(/[\s\,\n]/, $link); | |||
| # remove leading and trailing quotes from each flag. | |||
| @flags = map {s/^['"]|['"]$//g; $_} @flags; | |||
| foreach $flags (@flags) { | |||
| if ( | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -96,12 +96,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_PENRYN */ | |||
| /* #define FORCE_DUNNINGTON */ | |||
| /* #define FORCE_NEHALEM */ | |||
| /* #define FORCE_SANDYBRIDGE */ | |||
| /* #define FORCE_ATOM */ | |||
| /* #define FORCE_ATHLON */ | |||
| /* #define FORCE_OPTERON */ | |||
| /* #define FORCE_OPTERON_SSE3 */ | |||
| /* #define FORCE_BARCELONA */ | |||
| /* #define FORCE_SHANGHAI */ | |||
| /* #define FORCE_ISTANBUL */ | |||
| /* #define FORCE_BOBCAT */ | |||
| /* #define FORCE_BULLDOZER */ | |||
| /* #define FORCE_SSE_GENERIC */ | |||
| /* #define FORCE_VIAC3 */ | |||
| /* #define FORCE_NANO */ | |||
| @@ -116,12 +120,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_PPC440FP2 */ | |||
| /* #define FORCE_CELL */ | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3A */ | |||
| /* #define FORCE_LOONGSON3B */ | |||
| /* #define FORCE_LOONGSON3A */ | |||
| /* #define FORCE_LOONGSON3B */ | |||
| /* #define FORCE_ITANIUM2 */ | |||
| /* #define FORCE_GENERIC */ | |||
| /* #define FORCE_SPARC */ | |||
| /* #define FORCE_SPARCV7 */ | |||
| /* #define FORCE_GENERIC */ | |||
| #ifdef FORCE_P2 | |||
| #define FORCE | |||
| @@ -137,32 +141,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "P5" | |||
| #endif | |||
| #ifdef FORCE_COPPERMINE | |||
| #ifdef FORCE_KATMAI | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "PENTIUM3" | |||
| #define ARCHCONFIG "-DPENTIUM3 " \ | |||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | |||
| #define LIBNAME "coppermine" | |||
| #define CORENAME "COPPERMINE" | |||
| #define LIBNAME "katmai" | |||
| #define CORENAME "KATMAI" | |||
| #endif | |||
| #ifdef FORCE_KATMAI | |||
| #ifdef FORCE_COPPERMINE | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "PENTIUM3" | |||
| #define ARCHCONFIG "-DPENTIUM3 " \ | |||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " | |||
| #define LIBNAME "katmai" | |||
| #define CORENAME "KATMAI" | |||
| #define LIBNAME "coppermine" | |||
| #define CORENAME "COPPERMINE" | |||
| #endif | |||
| #ifdef FORCE_NORTHWOOD | |||
| @@ -278,6 +282,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "NEHALEM" | |||
| #endif | |||
| #ifdef FORCE_SANDYBRIDGE /* settings used when the build forces the SANDYBRIDGE target */ | |||
| #define FORCE /* marks that a target was forced */ | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" /* last piece of the ARCHCONFIG string: cache sizes, TLB settings and HAVE_* feature flags as -D options */ | |||
| #define LIBNAME "sandybridge" /* lower-case name for this target */ | |||
| #define CORENAME "SANDYBRIDGE" /* upper-case name for this target */ | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -342,13 +360,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ARCHCONFIG "-DBARCELONA " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ | |||
| "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" | |||
| #define LIBNAME "barcelona" | |||
| #define CORENAME "BARCELONA" | |||
| #endif | |||
| #if defined(FORCE_BOBCAT) /* settings used when the build forces the BOBCAT target */ | |||
| #define FORCE /* marks that a target was forced */ | |||
| #define FORCE_INTEL /* NOTE(review): also set by the SANDYBRIDGE block; presumably marks the x86 family rather than the vendor - confirm */ | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "BOBCAT" | |||
| #define ARCHCONFIG "-DBOBCAT " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" /* last piece of the ARCHCONFIG string; the string is a list of -D options */ | |||
| #define LIBNAME "bobcat" /* lower-case name for this target */ | |||
| #define CORENAME "BOBCAT" /* upper-case name for this target */ | |||
| #endif | |||
| #if defined (FORCE_BULLDOZER) /* settings used when the build forces the BULLDOZER target. NOTE(review): L2_SIZE below is 1024000, which is not a power of two (1 MiB is 1048576) - confirm the intended value */ | |||
| #define FORCE /* marks that a target was forced */ | |||
| #define FORCE_INTEL /* NOTE(review): also set by the SANDYBRIDGE block; presumably marks the x86 family rather than the vendor - confirm */ | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "BULLDOZER" | |||
| #define ARCHCONFIG "-DBULLDOZER " \ | |||
| "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4" /* last piece of the ARCHCONFIG string; the string is a list of -D options */ | |||
| #define LIBNAME "bulldozer" /* lower-case name for this target */ | |||
| #define CORENAME "BULLDOZER" /* upper-case name for this target */ | |||
| #endif | |||
| #ifdef FORCE_SSE_GENERIC | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -34,7 +34,7 @@ int main(int argc, char **argv) { | |||
| #ifdef USE64BITINT | |||
| printf("#define USE64BITINT\n"); | |||
| #endif | |||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); | |||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); | |||
| } | |||
| return 0; | |||
| @@ -318,7 +318,7 @@ CZBLAS3OBJS = \ | |||
| ifndef NO_CBLAS | |||
| CFLAGS += -I. | |||
| override CFLAGS += -I. | |||
| SBLAS1OBJS += $(CSBLAS1OBJS) | |||
| SBLAS2OBJS += $(CSBLAS2OBJS) | |||
| @@ -400,7 +400,7 @@ all :: libs | |||
| ifdef FUNCTION_PROFILE | |||
| $(BLASOBJS) $(BLASOBJS_P) : functable.h | |||
| $(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) | |||
| $(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) | |||
| functable.h : Makefile | |||
| ./create $(FUNCALLFILES) > functable.h | |||
| @@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $ | |||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | |||
| $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ | |||
| $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS | |||
| $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS | |||
| srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -6,7 +6,7 @@ TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| ifdef TARGET_CORE | |||
| CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| BUILD_KERNEL = 1 | |||
| KDIR = | |||
| TSUFFIX = _$(TARGET_CORE) | |||
| @@ -48,7 +48,7 @@ HPLOBJS = \ | |||
| COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) | |||
| ifdef DYNAMIC_ARCH | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) | |||
| CCOMMON_OPT += -DTS=$(TSUFFIX) | |||
| endif | |||
| @@ -0,0 +1,235 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) /* Packing copy: reads col columns of row two-FLOAT elements from src (consecutive columns are 2*srcdim FLOATs apart) and writes them to dest in panels of 4, then 2, then 1 columns; inside a panel the output goes row by row with the panel's columns interleaved. Always returns 0. Each element is two consecutive FLOATs, presumably the real and imaginary parts of a complex value - TODO confirm. */ | |||
| { | |||
| BLASLONG i,j; /* i counts groups of four rows, j counts four-column panels */ | |||
| BLASLONG idx=0; /* never referenced in this function */ | |||
| BLASLONG ii; /* scratch: number of FLOATs by which dest or dest0 is advanced */ | |||
| FLOAT *src0,*src1,*src2,*src3,*dest0; /* srcN: read cursor in column N of the current panel; dest0: write cursor inside the current panel */ | |||
| for (j=0; j<col/4; j+=1) /* full panels of four columns */ | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; /* the next column starts 2*srcdim FLOATs further on */ | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; /* src now points at the first column of the next panel */ | |||
| dest0 = dest; | |||
| ii = (row<<3); /* panel size in FLOATs: 4 columns * 2 FLOATs * row */ | |||
| dest = dest+ii; /* dest now points at the start of the next panel's output */ | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 32 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; /* first row of the group: one element from each of the four columns */ | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; /* second row of the group */ | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| dest0[16] = src0[4]; /* third row of the group */ | |||
| dest0[17] = src0[5]; | |||
| dest0[18] = src1[4]; | |||
| dest0[19] = src1[5]; | |||
| dest0[20] = src2[4]; | |||
| dest0[21] = src2[5]; | |||
| dest0[22] = src3[4]; | |||
| dest0[23] = src3[5]; | |||
| dest0[24] = src0[6]; /* fourth row of the group */ | |||
| dest0[25] = src0[7]; | |||
| dest0[26] = src1[6]; | |||
| dest0[27] = src1[7]; | |||
| dest0[28] = src2[6]; | |||
| dest0[29] = src2[7]; | |||
| dest0[30] = src3[6]; | |||
| dest0[31] = src3[7]; | |||
| src0 = src0+8; /* four elements of two FLOATs consumed per column */ | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| ii = (4<<3); /* 32 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| ii = (2<<3); /* 16 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| ii = (1<<3); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&2) /* bit 1 of col set: one panel of two columns */ | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; /* src now points past the two columns just claimed */ | |||
| dest0 = dest; | |||
| ii = (row<<2); /* panel size in FLOATs: 2 columns * 2 FLOATs * row */ | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 16 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| dest0[8] = src0[4]; | |||
| dest0[9] = src0[5]; | |||
| dest0[10] = src1[4]; | |||
| dest0[11] = src1[5]; | |||
| dest0[12] = src0[6]; | |||
| dest0[13] = src0[7]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| ii = (4<<2); /* 16 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| ii = (2<<2); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| ii = (1<<2); /* 4 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&1) /* bit 0 of col set: one last single column, copied straight through */ | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<1); /* panel size in FLOATs: 1 column * 2 FLOATs * row */ | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 8 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| src0 = src0+8; | |||
| ii = (4<<1); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| src0 = src0+4; | |||
| ii = (2<<1); /* 4 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| src0 = src0+2; | |||
| ii = (1<<1); /* 2 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,401 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) /* Packing copy: reads col columns of row two-FLOAT elements from src (consecutive columns are 2*srcdim FLOATs apart) and writes them to dest in panels of 8, then 4, then 2, then 1 columns; inside a panel the output goes row by row with the panel's columns interleaved. Always returns 0. Each element is two consecutive FLOATs, presumably the real and imaginary parts of a complex value - TODO confirm. */ | |||
| { | |||
| BLASLONG i,j; /* i counts groups of four rows, j counts eight-column panels */ | |||
| BLASLONG idx=0; /* never referenced in this function */ | |||
| BLASLONG ii; /* scratch: number of FLOATs by which dest or dest0 is advanced */ | |||
| FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; /* srcN: read cursor in column N of the current panel; dest0: write cursor inside the current panel */ | |||
| for (j=0; j<col/8; j+=1) /* full panels of eight columns */ | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; /* the next column starts 2*srcdim FLOATs further on */ | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src4 = src3+2*srcdim; | |||
| src5 = src4+2*srcdim; | |||
| src6 = src5+2*srcdim; | |||
| src7 = src6+2*srcdim; | |||
| src = src7+2*srcdim; /* src now points at the first column of the next panel */ | |||
| dest0 = dest; | |||
| ii = (row<<4); /* panel size in FLOATs: 8 columns * 2 FLOATs * row */ | |||
| dest = dest+ii; /* dest now points at the start of the next panel's output */ | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 64 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; /* first row of the group: one element from each of the eight columns */ | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| dest0[16] = src0[2]; /* second row of the group */ | |||
| dest0[17] = src0[3]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src2[2]; | |||
| dest0[21] = src2[3]; | |||
| dest0[22] = src3[2]; | |||
| dest0[23] = src3[3]; | |||
| dest0[24] = src4[2]; | |||
| dest0[25] = src4[3]; | |||
| dest0[26] = src5[2]; | |||
| dest0[27] = src5[3]; | |||
| dest0[28] = src6[2]; | |||
| dest0[29] = src6[3]; | |||
| dest0[30] = src7[2]; | |||
| dest0[31] = src7[3]; | |||
| dest0[32] = src0[4]; /* third row of the group */ | |||
| dest0[33] = src0[5]; | |||
| dest0[34] = src1[4]; | |||
| dest0[35] = src1[5]; | |||
| dest0[36] = src2[4]; | |||
| dest0[37] = src2[5]; | |||
| dest0[38] = src3[4]; | |||
| dest0[39] = src3[5]; | |||
| dest0[40] = src4[4]; | |||
| dest0[41] = src4[5]; | |||
| dest0[42] = src5[4]; | |||
| dest0[43] = src5[5]; | |||
| dest0[44] = src6[4]; | |||
| dest0[45] = src6[5]; | |||
| dest0[46] = src7[4]; | |||
| dest0[47] = src7[5]; | |||
| dest0[48] = src0[6]; /* fourth row of the group */ | |||
| dest0[49] = src0[7]; | |||
| dest0[50] = src1[6]; | |||
| dest0[51] = src1[7]; | |||
| dest0[52] = src2[6]; | |||
| dest0[53] = src2[7]; | |||
| dest0[54] = src3[6]; | |||
| dest0[55] = src3[7]; | |||
| dest0[56] = src4[6]; | |||
| dest0[57] = src4[7]; | |||
| dest0[58] = src5[6]; | |||
| dest0[59] = src5[7]; | |||
| dest0[60] = src6[6]; | |||
| dest0[61] = src6[7]; | |||
| dest0[62] = src7[6]; | |||
| dest0[63] = src7[7]; | |||
| src0 = src0+8; /* four elements of two FLOATs consumed per column */ | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| src4 = src4+8; | |||
| src5 = src5+8; | |||
| src6 = src6+8; | |||
| src7 = src7+8; | |||
| ii = (4<<4); /* 64 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| dest0[16] = src0[2]; | |||
| dest0[17] = src0[3]; | |||
| dest0[18] = src1[2]; | |||
| dest0[19] = src1[3]; | |||
| dest0[20] = src2[2]; | |||
| dest0[21] = src2[3]; | |||
| dest0[22] = src3[2]; | |||
| dest0[23] = src3[3]; | |||
| dest0[24] = src4[2]; | |||
| dest0[25] = src4[3]; | |||
| dest0[26] = src5[2]; | |||
| dest0[27] = src5[3]; | |||
| dest0[28] = src6[2]; | |||
| dest0[29] = src6[3]; | |||
| dest0[30] = src7[2]; | |||
| dest0[31] = src7[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| src4 = src4+4; | |||
| src5 = src5+4; | |||
| src6 = src6+4; | |||
| src7 = src7+4; | |||
| ii = (2<<4); /* 32 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src4[0]; | |||
| dest0[9] = src4[1]; | |||
| dest0[10] = src5[0]; | |||
| dest0[11] = src5[1]; | |||
| dest0[12] = src6[0]; | |||
| dest0[13] = src6[1]; | |||
| dest0[14] = src7[0]; | |||
| dest0[15] = src7[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| src4 = src4+2; | |||
| src5 = src5+2; | |||
| src6 = src6+2; | |||
| src7 = src7+2; | |||
| ii = (1<<4); /* 16 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&4) /* bit 2 of col set: one panel of four columns */ | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src2 = src1+2*srcdim; | |||
| src3 = src2+2*srcdim; | |||
| src = src3+2*srcdim; /* src now points past the four columns just claimed */ | |||
| dest0 = dest; | |||
| ii = (row<<3); /* panel size in FLOATs: 4 columns * 2 FLOATs * row */ | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 32 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| dest0[16] = src0[4]; | |||
| dest0[17] = src0[5]; | |||
| dest0[18] = src1[4]; | |||
| dest0[19] = src1[5]; | |||
| dest0[20] = src2[4]; | |||
| dest0[21] = src2[5]; | |||
| dest0[22] = src3[4]; | |||
| dest0[23] = src3[5]; | |||
| dest0[24] = src0[6]; | |||
| dest0[25] = src0[7]; | |||
| dest0[26] = src1[6]; | |||
| dest0[27] = src1[7]; | |||
| dest0[28] = src2[6]; | |||
| dest0[29] = src2[7]; | |||
| dest0[30] = src3[6]; | |||
| dest0[31] = src3[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| src2 = src2+8; | |||
| src3 = src3+8; | |||
| ii = (4<<3); /* 32 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| dest0[8] = src0[2]; | |||
| dest0[9] = src0[3]; | |||
| dest0[10] = src1[2]; | |||
| dest0[11] = src1[3]; | |||
| dest0[12] = src2[2]; | |||
| dest0[13] = src2[3]; | |||
| dest0[14] = src3[2]; | |||
| dest0[15] = src3[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| src2 = src2+4; | |||
| src3 = src3+4; | |||
| ii = (2<<3); /* 16 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src2[0]; | |||
| dest0[5] = src2[1]; | |||
| dest0[6] = src3[0]; | |||
| dest0[7] = src3[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| src2 = src2+2; | |||
| src3 = src3+2; | |||
| ii = (1<<3); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&2) /* bit 1 of col set: one panel of two columns */ | |||
| { | |||
| src0 = src; | |||
| src1 = src0+2*srcdim; | |||
| src = src1+2*srcdim; /* src now points past the two columns just claimed */ | |||
| dest0 = dest; | |||
| ii = (row<<2); /* panel size in FLOATs: 2 columns * 2 FLOATs * row */ | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 16 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| dest0[8] = src0[4]; | |||
| dest0[9] = src0[5]; | |||
| dest0[10] = src1[4]; | |||
| dest0[11] = src1[5]; | |||
| dest0[12] = src0[6]; | |||
| dest0[13] = src0[7]; | |||
| dest0[14] = src1[6]; | |||
| dest0[15] = src1[7]; | |||
| src0 = src0+8; | |||
| src1 = src1+8; | |||
| ii = (4<<2); /* 16 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| dest0[4] = src0[2]; | |||
| dest0[5] = src0[3]; | |||
| dest0[6] = src1[2]; | |||
| dest0[7] = src1[3]; | |||
| src0 = src0+4; | |||
| src1 = src1+4; | |||
| ii = (2<<2); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src1[0]; | |||
| dest0[3] = src1[1]; | |||
| src0 = src0+2; | |||
| src1 = src1+2; | |||
| ii = (1<<2); /* 4 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| if (col&1) /* bit 0 of col set: one last single column, copied straight through */ | |||
| { | |||
| src0 = src; | |||
| src = src0+2*srcdim; | |||
| dest0 = dest; | |||
| ii = (row<<1); /* panel size in FLOATs: 1 column * 2 FLOATs * row */ | |||
| dest = dest+ii; | |||
| for (i=0; i<row/4; i+=1) /* four rows per iteration, 8 FLOATs written */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| dest0[4] = src0[4]; | |||
| dest0[5] = src0[5]; | |||
| dest0[6] = src0[6]; | |||
| dest0[7] = src0[7]; | |||
| src0 = src0+8; | |||
| ii = (4<<1); /* 8 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&2) /* bit 1 of row set: copy two more rows */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| dest0[2] = src0[2]; | |||
| dest0[3] = src0[3]; | |||
| src0 = src0+4; | |||
| ii = (2<<1); /* 4 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| if (row&1) /* bit 0 of row set: copy one final row */ | |||
| { | |||
| dest0[0] = src0[0]; | |||
| dest0[1] = src0[1]; | |||
| src0 = src0+2; | |||
| ii = (1<<1); /* 2 FLOATs were written above */ | |||
| dest0 = dest0+ii; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,237 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* | |||
| * Pack a row x col block of FLOAT pairs from src into dest, 4 pairs wide. | |||
| * | |||
| * src    : first source row; consecutive rows are 2*srcdim FLOATs apart. | |||
| * dest   : output buffer, written in three regions (see below). | |||
| * | |||
| * Rows are consumed in groups of 4, then one group of 2 if (row&2), then one | |||
| * group of 1 if (row&1). For every full run of 4 pairs per row the group is | |||
| * stored as one tile: the 8 FLOATs of its first row, then of its second row, | |||
| * and so on. Tiles of successive row groups are contiguous, tiles of | |||
| * successive 4-pair runs are row<<3 FLOATs apart. Leftover columns (col&2, | |||
| * then col&1) are appended to two tail regions that start at | |||
| * dest+(col&-4)*(2*row) and dest+(col&-2)*(2*row). | |||
| * | |||
| * NOTE(review): the pairs are presumably complex (re,im) values - confirm. | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
|   BLASLONG lda = 2*srcdim;               /* FLOATs between source rows */ | |||
|   BLASLONG pass, group, ngroups, nrows; | |||
|   BLASLONG panel, r, k; | |||
|   FLOAT *in, *out; | |||
|   FLOAT *tail2 = dest + (col&-4)*(2*row); /* region for the 2-pair leftover */ | |||
|   FLOAT *tail1 = dest + (col&-2)*(2*row); /* region for the 1-pair leftover */ | |||
|   /* pass 0: all full groups of 4 rows; pass 1: 2 rows; pass 2: 1 row */ | |||
|   for (pass = 0; pass < 3; pass++) | |||
|   { | |||
|     nrows = 4 >> pass; | |||
|     if (pass == 0) | |||
|       ngroups = row/4; | |||
|     else | |||
|       ngroups = (row & nrows) ? 1 : 0; | |||
|     for (group = 0; group < ngroups; group++) | |||
|     { | |||
|       in = src; | |||
|       src = src + nrows*lda; | |||
|       out = dest; | |||
|       dest = dest + nrows*8; | |||
|       /* full runs of 4 pairs (8 FLOATs) per row */ | |||
|       for (panel = 0; panel < col/4; panel++) | |||
|       { | |||
|         for (r = 0; r < nrows; r++) | |||
|           for (k = 0; k < 8; k++) | |||
|             out[r*8+k] = in[r*lda+k]; | |||
|         in = in + 8; | |||
|         out = out + (row<<3); | |||
|       } | |||
|       /* leftover 2 pairs (4 FLOATs) per row */ | |||
|       if (col&2) | |||
|       { | |||
|         for (r = 0; r < nrows; r++) | |||
|           for (k = 0; k < 4; k++) | |||
|             tail2[r*4+k] = in[r*lda+k]; | |||
|         in = in + 4; | |||
|         tail2 = tail2 + nrows*4; | |||
|       } | |||
|       /* leftover single pair (2 FLOATs) per row */ | |||
|       if (col&1) | |||
|       { | |||
|         for (r = 0; r < nrows; r++) | |||
|           for (k = 0; k < 2; k++) | |||
|             tail1[r*2+k] = in[r*lda+k]; | |||
|         tail1 = tail1 + nrows*2; | |||
|       } | |||
|     } | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,370 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| /* | |||
| * Pack a row x col block of FLOAT pairs from src into dest, 8 pairs wide. | |||
| * | |||
| * src    : first source row; consecutive rows are 2*srcdim FLOATs apart. | |||
| * dest   : output buffer, written in four regions (see below). | |||
| * | |||
| * Rows are consumed in groups of 4, then one group of 2 if (row&2), then one | |||
| * group of 1 if (row&1). For every full run of 8 pairs per row the group is | |||
| * stored as one tile: the 16 FLOATs of its first row, then of its second | |||
| * row, and so on. Tiles of successive row groups are contiguous, tiles of | |||
| * successive 8-pair runs are row<<4 FLOATs apart. Leftover columns (col&4, | |||
| * col&2, col&1, in that order) are appended to three tail regions starting | |||
| * at dest+(col&-8)*(2*row), dest+(col&-4)*(2*row) and dest+(col&-2)*(2*row). | |||
| * | |||
| * NOTE(review): the pairs are presumably complex (re,im) values - confirm. | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) | |||
| { | |||
|   BLASLONG lda = 2*srcdim;               /* FLOATs between source rows */ | |||
|   BLASLONG pass, group, ngroups, nrows; | |||
|   BLASLONG panel, t, width, r, k; | |||
|   FLOAT *in, *out; | |||
|   FLOAT *tail[3];                        /* write cursors for the 4-, 2-, 1-pair leftovers */ | |||
|   tail[0] = dest + (col&-8)*(2*row); | |||
|   tail[1] = dest + (col&-4)*(2*row); | |||
|   tail[2] = dest + (col&-2)*(2*row); | |||
|   /* pass 0: all full groups of 4 rows; pass 1: 2 rows; pass 2: 1 row */ | |||
|   for (pass = 0; pass < 3; pass++) | |||
|   { | |||
|     nrows = 4 >> pass; | |||
|     if (pass == 0) | |||
|       ngroups = row/4; | |||
|     else | |||
|       ngroups = (row & nrows) ? 1 : 0; | |||
|     for (group = 0; group < ngroups; group++) | |||
|     { | |||
|       in = src; | |||
|       src = src + nrows*lda; | |||
|       out = dest; | |||
|       dest = dest + nrows*16; | |||
|       /* full runs of 8 pairs (16 FLOATs) per row */ | |||
|       for (panel = 0; panel < col/8; panel++) | |||
|       { | |||
|         for (r = 0; r < nrows; r++) | |||
|           for (k = 0; k < 16; k++) | |||
|             out[r*16+k] = in[r*lda+k]; | |||
|         in = in + 16; | |||
|         out = out + (row<<4); | |||
|       } | |||
|       /* leftovers: width = 8, 4, 2 FLOATs per row for col&4, col&2, col&1 */ | |||
|       for (t = 0; t < 3; t++) | |||
|       { | |||
|         width = 8 >> t; | |||
|         if (col & (width>>1)) | |||
|         { | |||
|           for (r = 0; r < nrows; r++) | |||
|             for (k = 0; k < width; k++) | |||
|               tail[t][r*width+k] = in[r*lda+k]; | |||
|           in = in + width; | |||
|           tail[t] = tail[t] + nrows*width; | |||
|         } | |||
|       } | |||
|     } | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -634,10 +634,10 @@ static void init_parameter(void) { | |||
| TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; | |||
| #endif | |||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) | |||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Katmai, Coppermine, Banias\n"); | |||
| fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = 64 * (l2 >> 7); | |||
| @@ -746,6 +746,22 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef SANDYBRIDGE | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Sandybridge\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef OPTERON | |||
| #ifdef DEBUG | |||
| @@ -778,6 +794,38 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef BOBCAT | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Bobcate\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef BULLDOZER | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Bulldozer\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef NANO | |||
| #ifdef DEBUG | |||
| @@ -0,0 +1,59 @@ | |||
| # Kernel source selection: each variable names the source file (or object) | |||
| # used for one routine. NOTE(review): the target CPU this file belongs to is | |||
| # not visible here - confirm against the file name. | |||
| # SGEMM: 4x4 kernel; the IN/IT copy variables are deliberately left empty. | |||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM: 2x4 kernel; IN/IT copies use the width-2 generic routines, ON/OT the width-4 ones. | |||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM: 2x2 kernel; the IN/IT copy variables are deliberately left empty. | |||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| CGEMMINCOPY = | |||
| CGEMMITCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM: 1x2 kernel; IN/IT copies use the width-1 generic routines, ON/OT the width-2 ones. | |||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM kernels. NOTE(review): every *_RN entry below points at the LT source | |||
| # file; presumably that source serves both variants - confirm. | |||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
| # NOTE(review): unlike the S/D/C groups, ZTRSMKERNEL_LN also uses the LT | |||
| # source (no LN_1x2 file is referenced) - confirm this is intended. | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
| # GEMM3M kernels. | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||
| @@ -0,0 +1,59 @@ | |||
| # Kernel source selection: each variable names the source file (or object) | |||
| # used for one routine. NOTE(review): the target CPU this file belongs to is | |||
| # not visible here - confirm against the file name. | |||
| # SGEMM: 4x4 kernel; the IN/IT copy variables are deliberately left empty. | |||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM: 2x4 kernel; IN/IT copies use the width-2 generic routines, ON/OT the width-4 ones. | |||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM: 2x2 kernel; the IN/IT copy variables are deliberately left empty. | |||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| CGEMMINCOPY = | |||
| CGEMMITCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM: 1x2 kernel; IN/IT copies use the width-1 generic routines, ON/OT the width-2 ones. | |||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM kernels. NOTE(review): every *_RN entry below points at the LT source | |||
| # file; presumably that source serves both variants - confirm. | |||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
| # NOTE(review): unlike the S/D/C groups, ZTRSMKERNEL_LN also uses the LT | |||
| # source (no LN_1x2 file is referenced) - confirm this is intended. | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
| # GEMM3M kernels. | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.PENRYN | |||
| @@ -495,7 +495,6 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| RESTOREREGISTERS | |||
| subl $8, %esp | |||
| movss %xmm0, (%esp) | |||
| @@ -76,6 +76,12 @@ | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifdef SANDYBRIDGE | |||
| #define PREFETCHSIZE (8 * 1 - 4) | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifndef PREFETCH | |||
| #define PREFETCH prefetcht0 | |||
| #endif | |||
| @@ -596,7 +596,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 4 * SIZE(BB), %xmm2 | |||
| @@ -842,7 +842,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1168,7 +1168,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1198,7 +1198,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1347,7 +1347,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -1531,7 +1531,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -1778,7 +1778,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -1793,7 +1793,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -1924,7 +1924,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -2069,7 +2069,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -69,6 +69,12 @@ | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifdef SANDYBRIDGE | |||
| #define PREFETCHSIZE (16 * 1 - 8) | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifndef PREFETCH | |||
| #define PREFETCH prefetcht0 | |||
| #endif | |||
| @@ -262,7 +268,7 @@ | |||
| movaps -16 * SIZE(AA), %xmm0 | |||
| addps %xmm2, %xmm7 | |||
| #ifndef NEHALEM | |||
| #if !(defined(NEHALEM) || defined(SANDYBRIDGE)) | |||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| pshufd $0x93, %xmm1, %xmm2 | |||
| @@ -58,7 +58,7 @@ | |||
| #define PREFETCHSIZE (16 * 4) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 7) | |||
| @@ -89,17 +89,22 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 20 + STACKSIZE(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||
| #define STACK_X 28 + STACKSIZE(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||
| #define Y 36 + STACKSIZE(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||
| #define BUFFER 44 + STACKSIZE(%esp) | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+ARGS(%esp) | |||
| #define YY 4+ARGS(%esp) | |||
| #define AA 8+ARGS(%esp) | |||
| #define LDAX 12+ARGS(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -114,6 +119,7 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -121,7 +127,34 @@ | |||
| PROFCODE | |||
| movl Y,J | |||
| movl J,YY # backup Y | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # backup MM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $21,J | |||
| subl J,MMM | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A | |||
| movl YY,J | |||
| movl J,Y | |||
| movl STACK_LDA, LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| @@ -651,12 +684,22 @@ | |||
| addss 0 * SIZE(X), %xmm0 | |||
| movss %xmm0, (Y1) | |||
| ALIGN_3 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl YY,J | |||
| addl %eax,J | |||
| movl J,YY | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -45,7 +45,7 @@ | |||
| #define PREFETCHSIZE (8 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (8 * 7) | |||
| @@ -76,17 +76,22 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 24 + STACKSIZE(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||
| #define STACK_X 32 + STACKSIZE(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||
| #define Y 40 + STACKSIZE(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||
| #define BUFFER 48 + STACKSIZE(%esp) | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+ARGS(%esp) | |||
| #define YY 4+ARGS(%esp) | |||
| #define AA 8+ARGS(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -101,6 +106,8 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -108,6 +115,33 @@ | |||
| PROFCODE | |||
| movl Y,J | |||
| movl J,YY # backup Y | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # backup M | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $20,J | |||
| subl J,MMM | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A | |||
| movl YY,J | |||
| movl J,Y | |||
| movl STACK_LDA, LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| @@ -677,10 +711,22 @@ | |||
| ALIGN_3 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl YY,J | |||
| addl %eax,J | |||
| movl J,YY | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -58,7 +58,7 @@ | |||
| #define PREFETCHSIZE (16 * 4) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 7) | |||
| @@ -89,17 +89,24 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 20 + STACKSIZE(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||
| #define STACK_X 28 + STACKSIZE(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||
| #define Y 36 + STACKSIZE(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||
| #define BUFFER 44 + STACKSIZE(%esp) | |||
| #define ARGS 20 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+STACKSIZE(%esp) | |||
| #define NN 4+STACKSIZE(%esp) | |||
| #define AA 8+STACKSIZE(%esp) | |||
| #define LDAX 12+STACKSIZE(%esp) | |||
| #define XX 16+STACKSIZE(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -114,6 +121,7 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -122,7 +130,42 @@ | |||
| PROFCODE | |||
| movl STACK_LDA, LDA | |||
| movl LDA,LDAX # backup LDA | |||
| movl STACK_X, X | |||
| movl X,XX | |||
| movl N,J | |||
| movl J,NN # backup N | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # mov M to MMM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $22,J # J=2^22 floats; 2^22*sizeof(float)=2^24 bytes=buffer size(16MB) | |||
| subl $8, J # Don't use the last 8 floats in the buffer. | |||
| # Now, split M by block J | |||
| subl J,MMM # MMM=MMM-J | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A # mov AA to A | |||
| movl NN,%eax | |||
| movl %eax,N # reset N | |||
| movl LDAX, LDA # reset LDA | |||
| movl XX,X | |||
| movl STACK_INCX, INCX | |||
| movl STACK_INCY, INCY | |||
| @@ -198,6 +241,20 @@ | |||
| jg .L06 | |||
| ALIGN_4 | |||
| // Pad with zeros so that stale values are never loaded from the buffer. | |||
| movl M, I | |||
| movl $8, J | |||
| andl $7, I | |||
| xorps %xmm0, %xmm0 | |||
| subl I, J | |||
| ALIGN_2 | |||
| .L07: | |||
| movss %xmm0, 0 * SIZE(Y1) | |||
| addl $SIZE, Y1 | |||
| decl J | |||
| jg .L07 | |||
| ALIGN_4 | |||
| .L10: | |||
| movl Y, Y1 | |||
| @@ -628,10 +685,22 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl XX,J | |||
| addl %eax,J | |||
| movl J,XX | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -45,7 +45,7 @@ | |||
| #define PREFETCHSIZE (8 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (8 * 7) | |||
| @@ -76,18 +76,24 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+STACKSIZE(%esp) | |||
| #define AA 4+STACKSIZE(%esp) | |||
| #define LDAX 8+STACKSIZE(%esp) | |||
| #define NN 12+STACKSIZE(%esp) | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 24 + STACKSIZE(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||
| #define STACK_X 32 + STACKSIZE(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||
| #define Y 40 + STACKSIZE(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||
| #define BUFFER 48 + STACKSIZE(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -101,6 +107,8 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -108,7 +116,40 @@ | |||
| PROFCODE | |||
| movl STACK_LDA, LDA | |||
| movl LDA,LDAX # backup LDA | |||
| movl N,J | |||
| movl J,NN # backup N | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # mov M to MMM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $21,J # J=2^21 doubles; 2^21*sizeof(double)=2^24 bytes=buffer size(16MB) | |||
| subl $4, J # Don't use the last 4 doubles in the buffer. | |||
| # Now, split M by block J | |||
| subl J,MMM # MMM=MMM-J | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A # mov AA to A | |||
| movl NN,%eax | |||
| movl %eax,N # reset N | |||
| movl LDAX, LDA # reset LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| movl STACK_INCY, INCY | |||
| @@ -117,6 +158,7 @@ | |||
| leal (,INCY, SIZE), INCY | |||
| leal (,LDA, SIZE), LDA | |||
| subl $-16 * SIZE, A | |||
| cmpl $0, N | |||
| @@ -560,10 +602,19 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -269,7 +269,7 @@ | |||
| sarl $5, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulps -32 * SIZE(X), %xmm1 | |||
| @@ -76,7 +76,8 @@ | |||
| xorps %xmm1, %xmm1 | |||
| comisd %xmm0, %xmm1 | |||
| jne .L100 # Alpha != ZERO | |||
| jp .L100 # For Alpha = NaN | |||
| /* Alpha == ZERO */ | |||
| cmpl $SIZE, INCX | |||
| jne .L50 | |||
| @@ -252,7 +253,7 @@ | |||
| sarl $4, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulpd -16 * SIZE(X), %xmm1 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -439,7 +439,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -488,7 +488,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -1697,7 +1697,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1727,7 +1727,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -437,7 +437,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -833,7 +833,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -1848,7 +1848,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2109,7 +2109,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2429,7 +2429,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -2459,7 +2459,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -2952,7 +2952,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -3148,7 +3148,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -3389,7 +3389,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -3404,7 +3404,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -910,7 +910,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -959,7 +959,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -1439,7 +1439,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1469,7 +1469,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -872,7 +872,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -1316,7 +1316,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1855,7 +1855,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1885,7 +1885,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -2249,7 +2249,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2562,7 +2562,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2957,7 +2957,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -2972,7 +2972,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -3280,7 +3280,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -3515,7 +3515,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -1036,7 +1036,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1066,7 +1066,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -2224,7 +2224,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -2273,7 +2273,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -62,7 +62,7 @@ | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHSIZE (8 * 21 + 4) | |||
| #endif | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -439,7 +439,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -454,7 +454,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -758,7 +758,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -993,7 +993,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -1324,7 +1324,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1354,7 +1354,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1718,7 +1718,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2031,7 +2031,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2859,7 +2859,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -3303,7 +3303,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1541,6 +1541,16 @@ | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| /*remove the hidden return value address from the stack.*/ | |||
| #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||
| #ifdef MS_ABI | |||
| /* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */ | |||
| ret | |||
| #else | |||
| /* remove the hidden return value address from the stack. For MingW GCC < 4.7 */ | |||
| ret $0x4 | |||
| #endif | |||
| #else | |||
| /*remove the hidden return value address from the stack on Linux.*/ | |||
| ret $0x4 | |||
| #endif | |||
| EPILOGUE | |||
| @@ -74,7 +74,7 @@ | |||
| #define BB %ecx | |||
| #define LDC %ebp | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| #define movsd movlps | |||
| #endif | |||
| @@ -625,7 +625,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 4 * SIZE(BB), %xmm2 | |||
| @@ -870,7 +870,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1173,7 +1173,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1203,7 +1203,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1359,7 +1359,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -1536,7 +1536,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -1794,7 +1794,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -1809,7 +1809,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -1936,7 +1936,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -2069,7 +2069,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -64,7 +64,7 @@ | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCHSIZE (8 * 1 - 4) | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHB prefetcht0 | |||
| @@ -64,7 +64,7 @@ | |||
| #define PREFETCHB prefetcht0 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCHSIZE (16 * 1 + 8) | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHB prefetcht0 | |||
| @@ -58,7 +58,7 @@ | |||
| #define PREFETCHSIZE (16 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 7) | |||
| @@ -71,7 +71,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 5) | |||
| @@ -45,7 +45,7 @@ | |||
| #define PREFETCHSIZE (8 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (8 * 7) | |||
| @@ -58,7 +58,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (8 * 5) | |||
| @@ -58,7 +58,7 @@ | |||
| #define PREFETCHSIZE (16 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (16 * 7) | |||
| @@ -71,7 +71,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 5) | |||
| @@ -45,7 +45,7 @@ | |||
| #define PREFETCHSIZE (8 * 2) | |||
| #endif | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) | |||
| #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| #define PREFETCHSIZE (8 * 7) | |||
| @@ -58,7 +58,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (8 * 5) | |||
| @@ -55,7 +55,7 @@ | |||
| #define XX %edi | |||
| #define FLAG %ebp | |||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||
| #define USE_PSHUFD | |||
| #else | |||
| #define USE_PSHUFD_HALF | |||
| @@ -697,7 +697,7 @@ | |||
| cmpl $2 * SIZE, INCX | |||
| jne .L120 | |||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||
| PSHUFD2($0, %xmm0, %xmm6) | |||
| PSHUFD2($0, %xmm1, %xmm1) | |||
| @@ -57,7 +57,7 @@ | |||
| #include "l1param.h" | |||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) | |||
| #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) | |||
| #define USE_PSHUFD | |||
| #else | |||
| #define USE_PSHUFD_HALF | |||
| @@ -860,7 +860,7 @@ | |||
| cmpl $2 * SIZE, INCX | |||
| jne .L220 | |||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) | |||
| #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) | |||
| #ifdef HAVE_SSE3 | |||
| movddup %xmm0, %xmm6 | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -533,7 +533,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -994,7 +994,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -63,7 +63,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -61,7 +61,7 @@ | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| #ifdef NEHALEM | |||
| #if defined(NEHALEM) || defined(SANDYBRIDGE) | |||
| #define PREFETCH prefetcht1 | |||
| #define PREFETCHSIZE 84 | |||
| #endif | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -1820,7 +1820,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -0,0 +1,62 @@ | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| @@ -0,0 +1,62 @@ | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| @@ -0,0 +1,84 @@ | |||
| SGEMMKERNEL = sgemm_kernel_8x8_sandy.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| #DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| #DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S | |||
| CGEMMKERNEL = cgemm_kernel_4x8_sandy.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
| #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
| #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
| #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
| #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
| #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
| #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
| #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
| #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S | |||
| #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S | |||
| #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S | |||
| #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||
| @@ -69,7 +69,7 @@ | |||
| #endif | |||
| movaps %xmm0, ALPHA | |||
| #else | |||
| movaps %xmm3, ALPHA | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| @@ -79,6 +79,10 @@ | |||
| SAVEREGISTERS | |||
| #ifdef WINDOWS_ABI | |||
| movaps %xmm3, ALPHA | |||
| #endif | |||
| shufps $0, ALPHA, ALPHA | |||
| leaq (, INCX, SIZE), INCX | |||
| @@ -69,7 +69,6 @@ | |||
| #endif | |||
| movaps %xmm0, ALPHA | |||
| #else | |||
| movaps %xmm3, ALPHA | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| @@ -79,6 +78,10 @@ | |||
| SAVEREGISTERS | |||
| #ifdef WINDOWS_ABI | |||
| movaps %xmm3, ALPHA | |||
| #endif | |||
| unpcklpd ALPHA, ALPHA | |||
| leaq (, INCX, SIZE), INCX | |||