Update release-0.3.0 branch to match developtags/v0.3.0
| @@ -7,6 +7,7 @@ language: c | |||
| jobs: | |||
| include: | |||
| - &test-ubuntu | |||
| os: linux | |||
| stage: test | |||
| compiler: gcc | |||
| addons: | |||
| @@ -57,7 +58,8 @@ jobs: | |||
| - TARGET_BOX=LINUX32 | |||
| - BTYPE="BINARY=32" | |||
| - stage: test | |||
| - os: linux | |||
| stage: test | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| @@ -77,6 +79,7 @@ jobs: | |||
| # which is slower than container-based infrastructure used for jobs | |||
| # that don't require sudo. | |||
| - &test-alpine | |||
| os: linux | |||
| stage: test | |||
| dist: trusty | |||
| sudo: true | |||
| @@ -120,6 +123,7 @@ jobs: | |||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" | |||
| - &test-cmake | |||
| os: linux | |||
| stage: test | |||
| compiler: clang | |||
| addons: | |||
| @@ -147,6 +151,23 @@ jobs: | |||
| env: | |||
| - CMAKE=1 | |||
| - &test-macos | |||
| os: osx | |||
| stage: test | |||
| osx_image: xcode8 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - brew update | |||
| - brew install gcc # for gfortran | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-macos | |||
| env: | |||
| - BTYPE="BINARY=32" | |||
| # whitelist | |||
| branches: | |||
| only: | |||
| @@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| @@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| endif | |||
| ifeq ($(OSNAME), NetBSD) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @@ -101,8 +96,9 @@ endif | |||
| #Generating openblas.pc | |||
| @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @@ -115,7 +111,7 @@ endif | |||
| ifndef NO_SHARED | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| endif | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
| @@ -17,6 +17,10 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| ifeq ($(TARGET), 1004K) | |||
| TARGET_FLAGS = -mips32r2 | |||
| endif | |||
| ifeq ($(TARGET), P5600) | |||
| TARGET_FLAGS = -mips32r5 | |||
| endif | |||
| @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev | |||
| # automatically detected by the the script. | |||
| # NUM_THREADS = 24 | |||
| # If you have enabled USE_OPENMP and your application would call | |||
| # OpenBLAS's calculation API from multi threads, please comment it in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can | |||
| # actually run in parallel. If more threads call OpenBLAS's calculation API, | |||
| # they need to wait for the preceding API calls to finish or risk data corruption. | |||
| # NUM_PARALLEL = 2 | |||
| # if you don't need to install the static library, please comment it in. | |||
| # NO_STATIC = 1 | |||
| @@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib | |||
| # http://stackoverflow.com/questions/4029274/mingw-and-make-variables | |||
| # - Default value is 'cc' which is not always a valid command (e.g. MinGW). | |||
| ifeq ($(origin CC),default) | |||
| # Check if $(CC) refers to a valid command and set the value to gcc if not | |||
| ifneq ($(findstring cmd.exe,$(SHELL)),) | |||
| ifeq ($(shell where $(CC) 2>NUL),) | |||
| CC = gcc | |||
| # Change the default compile to clang on Mac OSX. | |||
| # http://stackoverflow.com/questions/714100/os-detecting-makefile | |||
| UNAME_S := $(shell uname -s) | |||
| ifeq ($(UNAME_S),Darwin) | |||
| CC = clang | |||
| # EXTRALIB += -Wl,-no_compact_unwind | |||
| endif | |||
| endif | |||
| else # POSIX-ish | |||
| ifeq ($(shell command -v $(CC) 2>/dev/null),) | |||
| ifeq ($(shell uname -s),Darwin) | |||
| CC = clang | |||
| # EXTRALIB += -Wl,-no_compact_unwind | |||
| else | |||
| CC = gcc | |||
| endif # Darwin | |||
| endif # CC exists | |||
| endif # Shell is sane | |||
| endif # CC is set to default | |||
| # Default Fortran compiler (FC) is selected by f_check. | |||
| @@ -175,6 +184,10 @@ endif | |||
| endif | |||
| ifndef NUM_PARALLEL | |||
| NUM_PARALLEL = 1 | |||
| endif | |||
| ifndef NUM_THREADS | |||
| NUM_THREADS = $(NUM_CORES) | |||
| endif | |||
| @@ -230,7 +243,7 @@ endif | |||
| MD5SUM = md5 -r | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | |||
| MD5SUM = md5 -r | |||
| endif | |||
| @@ -424,7 +437,7 @@ CCOMMON_OPT += -fopenmp | |||
| endif | |||
| ifeq ($(C_COMPILER), INTEL) | |||
| CCOMMON_OPT += -openmp | |||
| CCOMMON_OPT += -fopenmp | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| @@ -555,9 +568,14 @@ CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), 1004K) | |||
| CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), P5600) | |||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), I6400) | |||
| @@ -704,7 +722,7 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -openmp | |||
| FCOMMON_OPT += -fopenmp | |||
| endif | |||
| endif | |||
| @@ -952,6 +970,8 @@ endif | |||
| CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) | |||
| CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) | |||
| ifdef USE_SIMPLE_THREADED_LEVEL3 | |||
| CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | |||
| endif | |||
| @@ -5,175 +5,219 @@ | |||
| Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS) | |||
| AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | |||
| ## Introduction | |||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. | |||
| Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>. | |||
| Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. | |||
| ## Binary Packages | |||
| We provide binary packages for the following platform. | |||
| We provide official binary packages for the following platform: | |||
| * Windows x86/x86_64 | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). | |||
| ## Installation from Source | |||
| Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||
| Or, check out codes from git://github.com/xianyi/OpenBLAS.git | |||
| ### Normal compile | |||
| * type "make" to detect the CPU automatically. | |||
| or | |||
| * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||
| Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. | |||
| ### Cross compile | |||
| Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | |||
| ### Dependencies | |||
| Examples: | |||
| Building OpenBLAS requires the following to be installed: | |||
| On X86 box, compile this library for loongson3a CPU. | |||
| * GNU Make | |||
| * A C compiler, e.g. GCC or Clang | |||
| * A Fortran compiler (optional, for LAPACK) | |||
| * IBM MASS (optional, see below) | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| ### Normal compile | |||
| On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. | |||
| Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. | |||
| To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. | |||
| The full target list is in the file `TargetList.txt`. | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ### Cross compile | |||
| ### Debug version | |||
| Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. | |||
| The target must be specified explicitly when cross compiling. | |||
| Examples: | |||
| make DEBUG=1 | |||
| * On an x86 box, compile this library for a loongson3a CPU: | |||
| ```sh | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| ``` | |||
| ### Compile with MASS Support on Power CPU (Optional dependency) | |||
| * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: | |||
| ```sh | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ``` | |||
| [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and | |||
| Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. | |||
| The library can be installed as below - | |||
| ### Debug version | |||
| * On Ubuntu: | |||
| A debug version can be built using `make DEBUG=1`. | |||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br> | |||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br> | |||
| sudo apt-get update</br> | |||
| sudo apt-get install libxlmass-devel.8.1.5</br> | |||
| ### Compile with MASS support on Power CPU (optional) | |||
| * On RHEL/CentOS: | |||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library | |||
| consists of a set of mathematical functions for C, C++, and Fortran applications that are | |||
| are tuned for optimum performance on POWER architectures. | |||
| OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | |||
| The library can be installed as shown: | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br> | |||
| sudo rpm --import repomd.xml.key</br> | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br> | |||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br> | |||
| sudo yum install libxlmass-devel.8.1.5</br> | |||
| * On Ubuntu: | |||
| ```sh | |||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - | |||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list | |||
| sudo apt-get update | |||
| sudo apt-get install libxlmass-devel.8.1.5 | |||
| ``` | |||
| After installing MASS library, compile openblas with USE_MASS=1. | |||
| * On RHEL/CentOS: | |||
| ```sh | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key | |||
| sudo rpm --import repomd.xml.key | |||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo | |||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ | |||
| sudo yum install libxlmass-devel.8.1.5 | |||
| ``` | |||
| Example: | |||
| After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. | |||
| For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`. | |||
| Compiling on Power8 with MASS support - | |||
| ### Install to a specific directory (optional) | |||
| make USE_MASS=1 TARGET=POWER8 | |||
| Use `PREFIX=` when invoking `make`, for example | |||
| ### Install to the directory (optional) | |||
| ```sh | |||
| make install PREFIX=your_installation_directory | |||
| ``` | |||
| Example: | |||
| The default installation directory is `/opt/OpenBLAS`. | |||
| make install PREFIX=your_installation_directory | |||
| ## Supported CPUs and Operating Systems | |||
| The default directory is /opt/OpenBLAS | |||
| Please read `GotoBLAS_01Readme.txt`. | |||
| ## Support CPU & OS | |||
| Please read GotoBLAS_01Readme.txt | |||
| ### Additional supported CPUs | |||
| ### Additional support CPU: | |||
| #### x86/x86-64 | |||
| #### x86/x86-64: | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | |||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | |||
| #### MIPS64: | |||
| #### MIPS64 | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | |||
| - **ICT Loongson 3B**: Experimental | |||
| #### ARM: | |||
| - **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) | |||
| - **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) | |||
| #### ARM | |||
| #### ARM64: | |||
| - **ARMV8**: Experimental | |||
| - **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+) | |||
| - **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15) | |||
| #### ARM64 | |||
| - **ARMv8**: Experimental | |||
| - **ARM Cortex-A57**: Experimental | |||
| #### PPC/PPC64 | |||
| - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1 | |||
| #### IBM zEnterprise System: | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||
| - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||
| ### Support OS: | |||
| - **GNU/Linux** | |||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. | |||
| - **FreeBSD**: Supported by community. We didn't test the library on this OS. | |||
| - **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| #### IBM zEnterprise System | |||
| ## Usages | |||
| Link with libopenblas.a or -lopenblas for shared library. | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||
| ### Set the number of threads with environment variables. | |||
| ### Supported OS | |||
| Examples: | |||
| - **GNU/Linux** | |||
| - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | |||
| - **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. | |||
| - **FreeBSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **OpenBSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| ## Usage | |||
| or | |||
| Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was | |||
| compiled as a shared library. | |||
| export GOTO_NUM_THREADS=4 | |||
| ### Setting the number of threads using environment variables | |||
| or | |||
| Environment variables are used to specify a maximum number of threads. | |||
| For example, | |||
| export OMP_NUM_THREADS=4 | |||
| ```sh | |||
| export OPENBLAS_NUM_THREADS=4 | |||
| export GOTO_NUM_THREADS=4 | |||
| export OMP_NUM_THREADS=4 | |||
| ``` | |||
| The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||
| The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`. | |||
| If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. | |||
| If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS` | |||
| environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when | |||
| compiled with `USE_OPENMP=1`. | |||
| ### Set the number of threads on runtime. | |||
| ### Setting the number of threads at runtime | |||
| We provided the below functions to control the number of threads on runtime. | |||
| We provide the following functions to control the number of threads at runtime: | |||
| void goto_set_num_threads(int num_threads); | |||
| ```c | |||
| void goto_set_num_threads(int num_threads); | |||
| void openblas_set_num_threads(int num_threads); | |||
| ``` | |||
| void openblas_set_num_threads(int num_threads); | |||
| If you compile this library with `USE_OPENMP=1`, you should use the above functions too. | |||
| If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||
| ## Reporting bugs | |||
| ## Report Bugs | |||
| Please add a issue in https://github.com/xianyi/OpenBLAS/issues | |||
| Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. | |||
| ## Contact | |||
| * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users | |||
| * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev | |||
| ## ChangeLog | |||
| Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||
| ## Change log | |||
| Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. | |||
| ## Troubleshooting | |||
| * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. | |||
| * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. | |||
| * The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. | |||
| * OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. | |||
| * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. | |||
| * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | |||
| * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | |||
| Clang 3.0 will generate the wrong AVX binary code. | |||
| * The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), | |||
| there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | |||
| the library with `BIGNUMA=1`. | |||
| * OpenBLAS does not set processor affinity by default. | |||
| On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in | |||
| Makefile.rule. However, note that this may cause | |||
| [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). | |||
| * On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`). | |||
| However, it will be okay when you run the same test case on the shell. | |||
| ## Contributing | |||
| 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. | |||
| 1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. | |||
| 1. Write a test which shows that the bug was fixed or that the feature works as expected. | |||
| 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. | |||
| 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue | |||
| to start a discussion around a feature idea or a bug. | |||
| 2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. | |||
| 3. Write a test which shows that the bug was fixed or that the feature works as expected. | |||
| 4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. | |||
| ## Donation | |||
| Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). | |||
| @@ -56,6 +56,7 @@ CELL | |||
| 3.MIPS CPU: | |||
| P5600 | |||
| 1004K | |||
| 4.MIPS64 CPU: | |||
| SICORTEX | |||
| @@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make | |||
| NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set | |||
| `MAX_CPU_NUMBER=NUM_THREADS`. | |||
| Despite its name, and due to the use of memory buffers in functions like SGEMM, | |||
| the setting of NUM_THREADS can be relevant even for a single-threaded build | |||
| of OpenBLAS, if such functions get called by multiple threads of a program | |||
| that uses OpenBLAS. In some cases, the affected code may simply crash or throw | |||
| a segmentation fault without displaying the above warning first. | |||
| Note that the number of threads used at runtime can be altered to differ from the | |||
| value NUM_THREADS was set to at build time. At runtime, the actual number of | |||
| threads can be set anywhere from 1 to the build's NUM_THREADS (note however, | |||
| that this does not change the number of memory buffers that will be allocated, | |||
| which is set at build time). The number of threads for a process can be set by | |||
| using the mechanisms described below. | |||
| #### How can I use OpenBLAS in multi-threaded applications? | |||
| If your application is already multi-threaded, it will conflict with OpenBLAS | |||
| @@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq ""); | |||
| $os = Linux if ($data =~ /OS_LINUX/); | |||
| $os = FreeBSD if ($data =~ /OS_FREEBSD/); | |||
| $os = NetBSD if ($data =~ /OS_NETBSD/); | |||
| $os = OpenBSD if ($data =~ /OS_OPENBSD/); | |||
| $os = DragonFly if ($data =~ /OS_DRAGONFLY/); | |||
| $os = Darwin if ($data =~ /OS_DARWIN/); | |||
| $os = SunOS if ($data =~ /OS_SUNOS/); | |||
| $os = AIX if ($data =~ /OS_AIX/); | |||
| @@ -1,6 +1,7 @@ | |||
| libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||
| openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OPENBLAS_VERSION@ | |||
| @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) | |||
| endif() | |||
| if (NOT DEFINED NUM_PARALLEL) | |||
| set(NUM_PARALLEL 1) | |||
| endif() | |||
| if (NOT DEFINED NUM_THREADS) | |||
| if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) | |||
| # HT? | |||
| @@ -224,6 +228,8 @@ endif () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") | |||
| if (USE_SIMPLE_THREADED_LEVEL3) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") | |||
| endif () | |||
| @@ -93,7 +93,7 @@ extern "C" { | |||
| #include <sched.h> | |||
| #endif | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) | |||
| #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) | |||
| #include <sched.h> | |||
| #endif | |||
| @@ -179,7 +179,7 @@ extern "C" { | |||
| #define ALLOCA_ALIGN 63UL | |||
| #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) | |||
| #define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) | |||
| #ifdef NEEDBUNDERSCORE | |||
| #define BLASFUNC(FUNC) FUNC##_ | |||
| @@ -649,6 +649,12 @@ int omp_get_num_procs(void); | |||
| __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||
| #endif | |||
| #if (__STDC_VERSION__ >= 201112L) | |||
| #ifndef _Atomic | |||
| #define _Atomic volatile | |||
| #endif | |||
| #include <stdatomic.h> | |||
| #endif | |||
| #else | |||
| #ifdef __ELF__ | |||
| int omp_in_parallel (void) __attribute__ ((weak)); | |||
| @@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| result = x/y; | |||
| return result; | |||
| #else | |||
| #if (MAX_CPU_NUMBER > 64) | |||
| if ( y > 64) { | |||
| result = x/y; | |||
| return result; | |||
| } | |||
| #endif | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| @@ -327,7 +333,7 @@ REALNAME: | |||
| #endif | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 16; \ | |||
| @@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| if (y <= 1) return x; | |||
| #if (MAX_CPU_NUMBER > 64) | |||
| if (y > 64) { | |||
| result = x / y; | |||
| return result; | |||
| } | |||
| #endif | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| @@ -403,7 +410,7 @@ REALNAME: | |||
| #define EPILOGUE .end | |||
| #endif | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) | |||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI) | |||
| #define PROLOGUE \ | |||
| .text; \ | |||
| .align 512; \ | |||
| @@ -121,7 +121,7 @@ int detect(void) | |||
| return CPU_VULCAN; | |||
| else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||
| else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) | |||
| return CPU_THUNDERX2T99; | |||
| } | |||
| @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_P5600 1 | |||
| #define CPU_1004K 2 | |||
| static char *cpuname[] = { | |||
| "UNKOWN", | |||
| "P5600" | |||
| "P5600", | |||
| "1004K" | |||
| }; | |||
| int detect(void){ | |||
| @@ -90,7 +92,7 @@ int detect(void){ | |||
| if (!strncmp("cpu", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| fprintf(stderr, "%s \n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| @@ -99,43 +101,13 @@ int detect(void){ | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("system type", buffer, 11)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (strstr(p, "loongson3a")) | |||
| return CPU_LOONGSON3A; | |||
| }else{ | |||
| if (strstr(p, "5600")) { | |||
| return CPU_P5600; | |||
| } else if (strstr(p, "1004K")) { | |||
| return CPU_1004K; | |||
| } else | |||
| return CPU_UNKNOWN; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("model name", buffer, 10)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -149,7 +121,7 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_P5600){ | |||
| if(detect()==CPU_P5600|| detect()==CPU_1004K){ | |||
| printf("P5600"); | |||
| }else{ | |||
| printf("UNKNOWN"); | |||
| @@ -170,6 +142,14 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| } else if (detect()==CPU_1004K) { | |||
| printf("#define MIPS1004K\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 26144\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 8\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| @@ -178,6 +158,8 @@ void get_cpuconfig(void){ | |||
| void get_libname(void){ | |||
| if(detect()==CPU_P5600) { | |||
| printf("p5600\n"); | |||
| } else if (detect()==CPU_1004K) { | |||
| printf("1004K\n"); | |||
| }else{ | |||
| printf("mips\n"); | |||
| } | |||
| @@ -60,6 +60,14 @@ OS_FREEBSD | |||
| OS_NETBSD | |||
| #endif | |||
| #if defined(__OpenBSD__) | |||
| OS_OPENBSD | |||
| #endif | |||
| #if defined(__DragonFly__) | |||
| OS_DRAGONFLY | |||
| #endif | |||
| #if defined(__sun) | |||
| OS_SUNOS | |||
| #endif | |||
| @@ -91,7 +91,12 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -67,7 +67,12 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -91,7 +91,12 @@ | |||
| #endif | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) | |||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
| #include <dlfcn.h> | |||
| #include <signal.h> | |||
| #include <sys/resource.h> | |||
| @@ -36,6 +36,7 @@ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdbool.h> | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| //#include <sys/mman.h> | |||
| @@ -49,11 +50,16 @@ | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| #else | |||
| static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| #endif | |||
| void goto_set_num_threads(int num_threads) { | |||
| int i=0; | |||
| int i=0, j=0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| @@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) { | |||
| omp_set_num_threads(blas_cpu_number); | |||
| //adjust buffer for each thread | |||
| for(i=0; i<blas_cpu_number; i++){ | |||
| if(blas_thread_buffer[i]==NULL){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<blas_cpu_number; j++){ | |||
| if(blas_thread_buffer[i][j]==NULL){ | |||
| blas_thread_buffer[i][j]=blas_memory_alloc(2); | |||
| } | |||
| } | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| for(; j<MAX_CPU_NUMBER; j++){ | |||
| if(blas_thread_buffer[i][j]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i][j]); | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| } | |||
| #if defined(ARCH_MIPS64) | |||
| @@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) { | |||
| int blas_thread_init(void){ | |||
| int i=0; | |||
| int i=0, j=0; | |||
| blas_get_cpu_number(); | |||
| blas_server_avail = 1; | |||
| for(i=0; i<blas_num_threads; i++){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| blas_thread_buffer[i]=NULL; | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<blas_num_threads; j++){ | |||
| blas_thread_buffer[i][j]=blas_memory_alloc(2); | |||
| } | |||
| for(; j<MAX_CPU_NUMBER; j++){ | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i=0; | |||
| int i=0, j=0; | |||
| blas_server_avail = 0; | |||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| for(i=0; i<MAX_PARALLEL_NUMBER; i++) { | |||
| for(j=0; j<MAX_CPU_NUMBER; j++){ | |||
| if(blas_thread_buffer[i][j]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i][j]); | |||
| blas_thread_buffer[i][j]=NULL; | |||
| } | |||
| } | |||
| } | |||
| @@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| static void exec_threads(blas_queue_t *queue){ | |||
| static void exec_threads(blas_queue_t *queue, int buf_index){ | |||
| void *buffer, *sa, *sb; | |||
| int pos=0, release_flag=0; | |||
| @@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | |||
| pos = omp_get_thread_num(); | |||
| buffer = blas_thread_buffer[pos]; | |||
| buffer = blas_thread_buffer[buf_index][pos]; | |||
| //fallback | |||
| if(buffer==NULL) { | |||
| @@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| BLASLONG i; | |||
| BLASLONG i, buf_index; | |||
| if ((num <= 0) || (queue == NULL)) return 0; | |||
| @@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| } | |||
| #endif | |||
| while(true) { | |||
| for(i=0; i < MAX_PARALLEL_NUMBER; i++) { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Bool inuse = false; | |||
| if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { | |||
| #else | |||
| if(blas_buffer_inuse[i] == false) { | |||
| blas_buffer_inuse[i] = true; | |||
| #endif | |||
| buf_index = i; | |||
| break; | |||
| } | |||
| } | |||
| if(i != MAX_PARALLEL_NUMBER) | |||
| break; | |||
| } | |||
| #pragma omp parallel for schedule(static) | |||
| for (i = 0; i < num; i ++) { | |||
| @@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| queue[i].position = i; | |||
| #endif | |||
| exec_threads(&queue[i]); | |||
| exec_threads(&queue[i], buf_index); | |||
| } | |||
| #if __STDC_VERSION__ >= 201112L | |||
| atomic_store(&blas_buffer_inuse[buf_index], false); | |||
| #else | |||
| blas_buffer_inuse[buf_index] = false; | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||
| #include <sys/sysctl.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| @@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #else | |||
| #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) | |||
| #define CONSTRUCTOR __attribute__ ((constructor(101))) | |||
| #define DESTRUCTOR __attribute__ ((destructor(101))) | |||
| #else | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #endif | |||
| #ifdef DYNAMIC_ARCH | |||
| @@ -209,7 +212,8 @@ int ret; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) return nums; | |||
| nums = CPU_COUNT_S(size,cpusetp); | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| #endif | |||
| @@ -246,7 +250,7 @@ int get_num_procs(void) { | |||
| #endif | |||
| #if defined(OS_FREEBSD) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
| int get_num_procs(void) { | |||
| @@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -344,7 +348,7 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| @@ -368,7 +372,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -54,6 +54,9 @@ static char* openblas_config_str="" | |||
| #ifdef NO_AFFINITY | |||
| "NO_AFFINITY " | |||
| #endif | |||
| #ifdef USE_OPENMP | |||
| "USE_OPENMP " | |||
| #endif | |||
| #ifndef DYNAMIC_ARCH | |||
| CHAR_CORENAME | |||
| #endif | |||
| @@ -61,18 +64,23 @@ static char* openblas_config_str="" | |||
| #ifdef DYNAMIC_ARCH | |||
| char *gotoblas_corename(); | |||
| static char tmp_config_str[256]; | |||
| #endif | |||
| static char tmp_config_str[256]; | |||
| int openblas_get_parallel(); | |||
| char* CNAME() { | |||
| #ifndef DYNAMIC_ARCH | |||
| return openblas_config_str; | |||
| #else | |||
| char tmpstr[20]; | |||
| strcpy(tmp_config_str, openblas_config_str); | |||
| #ifdef DYNAMIC_ARCH | |||
| strcat(tmp_config_str, gotoblas_corename()); | |||
| return tmp_config_str; | |||
| #endif | |||
| if (openblas_get_parallel() == 0) | |||
| sprintf(tmpstr, " SINGLE_THREADED"); | |||
| else | |||
| snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); | |||
| strcat(tmp_config_str, tmpstr); | |||
| return tmp_config_str; | |||
| } | |||
| @@ -83,3 +91,4 @@ char* openblas_get_corename() { | |||
| return gotoblas_corename(); | |||
| #endif | |||
| } | |||
| @@ -156,7 +156,7 @@ endif | |||
| endif | |||
| #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) | |||
| so : ../$(LIBSONAME) | |||
| @@ -97,7 +97,7 @@ if ($compiler eq "") { | |||
| if ($data =~ /Intel/) { | |||
| $vendor = INTEL; | |||
| $openmp = "-openmp"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($data =~ /Sun Fortran/) { | |||
| @@ -127,7 +127,7 @@ if ($compiler eq "") { | |||
| # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | |||
| if ($data =~ /zho_ge__/) { | |||
| if ($data =~ / zho_ge__/) { | |||
| $need2bu = 1; | |||
| } | |||
| } | |||
| @@ -155,7 +155,7 @@ if ($compiler eq "") { | |||
| if ($compiler =~ /ifort/) { | |||
| $vendor = INTEL; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /pathf/) { | |||
| @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef OS_WINDOWS | |||
| #include <windows.h> | |||
| #endif | |||
| #if defined(__FreeBSD__) || defined(__APPLE__) | |||
| #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| #include <sys/types.h> | |||
| #include <sys/sysctl.h> | |||
| #endif | |||
| @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { | |||
| #ifdef OS_WINDOWS | |||
| SYSTEM_INFO sysinfo; | |||
| #elif defined(__FreeBSD__) || defined(__APPLE__) | |||
| #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| int m[2], count; | |||
| size_t len; | |||
| #endif | |||
| @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { | |||
| GetSystemInfo(&sysinfo); | |||
| return sysinfo.dwNumberOfProcessors; | |||
| #elif defined(__FreeBSD__) || defined(__APPLE__) | |||
| #elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) | |||
| m[0] = CTL_HW; | |||
| m[1] = HW_NCPU; | |||
| len = sizeof(int); | |||
| @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } else | |||
| nthreads = 1; | |||
| /* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||
| nthreads = 1; | |||
| if(nthreads > 1) { | |||
| buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
| } | |||
| @@ -29,10 +29,8 @@ USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), HASWELL) | |||
| ifeq ($(ARCH), x86_64) | |||
| USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ZEN) | |||
| USE_TRMM = 1 | |||
| @@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble axpy_kernel_L999 | |||
| /* | |||
| cmp INC_X, #0 | |||
| beq axpy_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq axpy_kernel_L999 | |||
| */ | |||
| cmp INC_X, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| @@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble rot_kernel_L999 | |||
| /* | |||
| cmp INC_X, #0 | |||
| beq rot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq rot_kernel_L999 | |||
| */ | |||
| cmp INC_X, #1 | |||
| bne rot_kernel_S_BEGIN | |||
| @@ -584,6 +584,12 @@ rot_kernel_S1: | |||
| rot_kernel_S10: | |||
| KERNEL_S1 | |||
| cmp INC_X, #0 | |||
| beq rot_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq rot_kernel_L999 | |||
| subs I, I, #1 | |||
| bne rot_kernel_S10 | |||
| @@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (m & 1) { | |||
| if (X > posY) { | |||
| /* ao1 += 1; | |||
| ao2 += 1; */ | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| #ifdef UNIT | |||
| if (X < posY) { | |||
| #endif | |||
| b[ 0] = *(ao1 + 0); | |||
| #ifdef UNIT | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = ONE; | |||
| b[ 1] = data02; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| data02 = *(ao1 + 1); | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| #endif | |||
| ao1 += 2; | |||
| b += 2; | |||
| } | |||
| #endif | |||
| b[ 1] = *(ao1 + 1); | |||
| b += 2; | |||
| } | |||
| posY += 2; | |||
| @@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } while (i > 0); | |||
| } | |||
| // posY += 1; | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i = (m & 15); | |||
| if (i > 0) { | |||
| if (X < posY) { | |||
| /* a01 += i; | |||
| a01 += i; | |||
| a02 += i; | |||
| a03 += i; | |||
| a04 += i; | |||
| @@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| a13 += i; | |||
| a14 += i; | |||
| a15 += i; | |||
| a16 += i; */ | |||
| a16 += i; | |||
| b += 16 * i; | |||
| } else | |||
| if (X > posY) { | |||
| @@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i = (m & 7); | |||
| if (i > 0) { | |||
| if (X < posY) { | |||
| /* a01 += i; | |||
| a01 += i; | |||
| a02 += i; | |||
| a03 += i; | |||
| a04 += i; | |||
| a05 += i; | |||
| a06 += i; | |||
| a07 += i; | |||
| a08 += i; */ | |||
| a08 += i; | |||
| b += 8 * i; | |||
| } else | |||
| if (X > posY) { | |||
| @@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b += 8; | |||
| } | |||
| /* a02 += i * lda; | |||
| a02 += i * lda; | |||
| a03 += i * lda; | |||
| a04 += i * lda; | |||
| a05 += i * lda; | |||
| a06 += i * lda; | |||
| a07 += i * lda; | |||
| a08 += i * lda; */ | |||
| a08 += i * lda; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| @@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i = (m & 3); | |||
| if (i > 0) { | |||
| if (X < posY) { | |||
| /* a01 += i; | |||
| a01 += i; | |||
| a02 += i; | |||
| a03 += i; | |||
| a04 += i; */ | |||
| a04 += i; | |||
| b += 4 * i; | |||
| } else | |||
| if (X > posY) { | |||
| @@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| a01 += lda; | |||
| b += 4; | |||
| } | |||
| /* a02 += lda; | |||
| a02 += lda; | |||
| a03 += lda; | |||
| a04 += lda; */ | |||
| a04 += lda; | |||
| } else { | |||
| #ifdef UNIT | |||
| @@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (X < posY) { | |||
| a01 ++; | |||
| a02 ++; | |||
| } else { | |||
| #ifdef UNIT | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| #endif | |||
| b[ 0] = *(a01 + 0); | |||
| #ifdef UNIT | |||
| b[ 1] = *(a01 + 1); | |||
| a01 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| } | |||
| b[ 1] = *(a01 + 1); | |||
| #else | |||
| b[ 0] = *(a01 + 0); | |||
| b[ 1] = *(a01 + 1); | |||
| #endif | |||
| b[ 1] = *(a01 + 1); | |||
| } | |||
| b += 2; | |||
| b += 2; | |||
| } | |||
| } | |||
| posY += 2; | |||
| } | |||
| @@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (i > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| a01 ++; | |||
| } else { | |||
| #ifdef UNIT | |||
| a01 += 1; | |||
| b ++; | |||
| } else | |||
| if (X > posY) { | |||
| #endif | |||
| b[ 0] = *(a01 + 0); | |||
| #ifdef UNIT | |||
| a01 += lda; | |||
| b ++; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| } | |||
| #else | |||
| b[ 0] = *(a01 + 0); | |||
| #endif | |||
| a01 += lda; | |||
| } | |||
| b ++; | |||
| X ++; | |||
| i --; | |||
| a01 += lda; | |||
| b ++; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| // posY += 1; | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (m & 1) { | |||
| if (X < posY) { | |||
| /* ao1 += 1; | |||
| ao2 += 1; */ | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| // ao1 += lda; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b[ 0] = data01; | |||
| b[ 1] = ZERO; | |||
| #endif | |||
| // ao1 += lda; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } | |||
| } | |||
| @@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i = m; | |||
| if (m > 0) { | |||
| do { | |||
| if (X < posY) { | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else | |||
| if (X > posY) { | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| b += 1; | |||
| ao1 += lda; | |||
| } else { | |||
| #ifdef UNIT | |||
| if (X > posY) { | |||
| #endif | |||
| b[ 0] = *(ao1 + 0); | |||
| #ifdef UNIT | |||
| } else { | |||
| b[ 0] = ONE; | |||
| } | |||
| b[ 0] = ONE; | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| b ++; | |||
| ao1 += lda; | |||
| X ++; | |||
| b += 1; | |||
| ao1 += lda; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| @@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (X < posY) { | |||
| if (m & 2) { | |||
| /* ao1 += 2; | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| ao3 += 2; | |||
| ao4 += 2; */ | |||
| ao4 += 2; | |||
| b += 8; | |||
| } | |||
| if (m & 1) { | |||
| /* ao1 += 1; | |||
| ao1 += 1; | |||
| ao2 += 1; | |||
| ao3 += 1; | |||
| ao4 += 1; */ | |||
| ao4 += 1; | |||
| b += 4; | |||
| } | |||
| @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b[ 7] = data08; | |||
| ao1 += 2 * lda; | |||
| // ao2 += 2 * lda; | |||
| ao2 += 2 * lda; | |||
| b += 8; | |||
| } | |||
| @@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b[ 2] = data03; | |||
| b[ 3] = data04; | |||
| // ao1 += lda; | |||
| ao1 += lda; | |||
| b += 4; | |||
| } | |||
| @@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| if (i) { | |||
| if (X < posY) { | |||
| // ao1 += 2; | |||
| ao1 += 2; | |||
| b += 2; | |||
| } else | |||
| if (X > posY) { | |||
| @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b[ 0] = data01; | |||
| b[ 1] = data02; | |||
| // ao1 += lda; | |||
| ao1 += lda; | |||
| b += 2; | |||
| } else { | |||
| #ifdef UNIT | |||
| @@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| do { | |||
| if (X < posY) { | |||
| b += 1; | |||
| ao1 += 1; | |||
| } else { | |||
| #ifdef UNIT | |||
| } else | |||
| if (X > posY) { | |||
| #endif | |||
| b[ 0] = *(ao1 + 0); | |||
| #ifdef UNIT | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| ao1 += lda; | |||
| b += 1; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| } | |||
| #else | |||
| data01 = *(ao1 + 0); | |||
| b[ 0] = data01; | |||
| #endif | |||
| ao1 += lda; | |||
| } | |||
| b ++; | |||
| X ++; | |||
| ao1 += lda; | |||
| b += 1; | |||
| } | |||
| X += 1; | |||
| i --; | |||
| } while (i > 0); | |||
| } | |||
| @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| } | |||
| a1 += 2 * lda; | |||
| // a2 += 2 * lda; | |||
| a2 += 2 * lda; | |||
| b += 8; | |||
| ii += 2; | |||
| @@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| if (m & 1) { | |||
| #ifdef UNIT | |||
| if (X > posY) { | |||
| ao1 += 2; | |||
| ao2 += 2; | |||
| b += 4; | |||
| } else | |||
| if (X < posY) { | |||
| #endif | |||
| b[ 0] = *(ao1 + 0); | |||
| b[ 1] = *(ao1 + 1); | |||
| #ifdef UNIT | |||
| data1 = *(ao1 + 0); | |||
| data2 = *(ao1 + 1); | |||
| data3 = *(ao1 + 2); | |||
| data4 = *(ao1 + 3); | |||
| b[ 0] = data1; | |||
| b[ 1] = data2; | |||
| b[ 2] = data3; | |||
| b[ 3] = data4; | |||
| ao1 += lda; | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| data3 = *(ao1 + 2); | |||
| data4 = *(ao1 + 3); | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| } | |||
| b[ 2] = data3; | |||
| b[ 3] = data4; | |||
| #else | |||
| data1 = *(ao1 + 0); | |||
| data2 = *(ao1 + 1); | |||
| data3 = *(ao1 + 2); | |||
| data4 = *(ao1 + 3); | |||
| b[ 0] = data1; | |||
| b[ 1] = data2; | |||
| b[ 2] = data3; | |||
| b[ 3] = data4; | |||
| #endif | |||
| b += 4; | |||
| b += 4; | |||
| } | |||
| } | |||
| posY += 2; | |||
| @@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } while (i > 0); | |||
| } | |||
| // posY += 1; | |||
| posY += 1; | |||
| } | |||
| return 0; | |||
| @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01 = 0.0, data02 = 0.0; | |||
| FLOAT data01, data02; | |||
| FLOAT *a1; | |||
| lda *= 2; | |||
| @@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01 = 0.0, data02 = 0.0, data03, data04; | |||
| FLOAT data05, data06, data07 = 0.0, data08 = 0.0; | |||
| FLOAT data01, data02, data03, data04; | |||
| FLOAT data05, data06, data07, data08; | |||
| FLOAT *a1, *a2; | |||
| lda *= 2; | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.P5600 | |||
| @@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c | |||
| else | |||
| SASUMKERNEL = ../mips/asum.c | |||
| DASUMKERNEL = ../mips/asum.c | |||
| CASUMKERNEL = ../mips/asum.c | |||
| ZASUMKERNEL = ../mips/asum.c | |||
| CASUMKERNEL = ../mips/zasum.c | |||
| ZASUMKERNEL = ../mips/zasum.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| @@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| endif | |||
| @@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||
| FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||
| v2f64 v_alpha; | |||
| v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0; | |||
| v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0}; | |||
| v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||
| v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; | |||
| v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; | |||
| v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0}; | |||
| v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); | |||
| @@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| while(i < n) | |||
| { | |||
| dot += y[iy] * x[ix] ; | |||
| #if defined(DSDOT) | |||
| dot += (double)y[iy] * (double)x[ix] ; | |||
| #else | |||
| dot += y[iy] * x[ix]; | |||
| #endif | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| @@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, | |||
| FLOAT *y_org = y; | |||
| FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; | |||
| FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | |||
| v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0; | |||
| v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0}; | |||
| v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; | |||
| v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; | |||
| v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0}; | |||
| v_alpha = COPY_FLOAT_TO_VECTOR(alpha); | |||
| @@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| @@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x, | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_c_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| } | |||
| @@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| lapack_int lrv, lcv; /* row, column stride */ | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| lrv = 1; | |||
| lcv = ldv; | |||
| } else { | |||
| lrv = ldv; | |||
| lcv = 1; | |||
| } | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -13; | |||
| } | |||
| @@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct | |||
| if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], | |||
| ldv ) ) | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > nrows_v ) { | |||
| @@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct | |||
| return -8; | |||
| } | |||
| if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, | |||
| &v[(nrows_v-k)*ldv], ldv ) ) | |||
| &v[(nrows_v-k)*lrv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], | |||
| ldv ) ) | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > ncols_v ) { | |||
| LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); | |||
| return -8; | |||
| } | |||
| if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], | |||
| ldv ) ) | |||
| if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, | |||
| &v[(ncols_v-k)*lcv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) | |||
| return -9; | |||
| @@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha, | |||
| if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_c_nancheck( n-1, x, incx ) ) { | |||
| return -3; | |||
| } | |||
| } | |||
| @@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, | |||
| lapack_complex_float tau, lapack_complex_float* c, | |||
| lapack_int ldc, lapack_complex_float* work ) | |||
| { | |||
| lapack_int lv; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_clarfx", -1 ); | |||
| return -1; | |||
| @@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, | |||
| if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) { | |||
| return -6; | |||
| } | |||
| if( LAPACKE_c_nancheck( m, v, 1 ) ) { | |||
| lv = (LAPACKE_lsame( side, 'l' ) ? m : n); | |||
| if( LAPACKE_c_nancheck( lv, v, 1 ) ) { | |||
| return -5; | |||
| } | |||
| } | |||
| @@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x, | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ | |||
| if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_c_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { | |||
| @@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -10; | |||
| } | |||
| if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { | |||
| return -9; | |||
| } | |||
| } | |||
| @@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { | |||
| return -8; | |||
| } | |||
| } | |||
| @@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| lapack_int lrv, lcv; /* row, column stride */ | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| lrv = 1; | |||
| lcv = ldv; | |||
| } else { | |||
| lrv = ldv; | |||
| lcv = 1; | |||
| } | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -13; | |||
| } | |||
| @@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct | |||
| if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], | |||
| ldv ) ) | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > nrows_v ) { | |||
| @@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct | |||
| return -8; | |||
| } | |||
| if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, | |||
| &v[(nrows_v-k)*ldv], ldv ) ) | |||
| &v[(nrows_v-k)*lrv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], | |||
| ldv ) ) | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > ncols_v ) { | |||
| LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); | |||
| return -8; | |||
| } | |||
| if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], | |||
| ldv ) ) | |||
| if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, | |||
| &v[(ncols_v-k)*lcv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) | |||
| return -9; | |||
| @@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x, | |||
| if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_d_nancheck( n-1, x, incx ) ) { | |||
| return -3; | |||
| } | |||
| } | |||
| @@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, | |||
| lapack_int n, const double* v, double tau, double* c, | |||
| lapack_int ldc, double* work ) | |||
| { | |||
| lapack_int lv; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_dlarfx", -1 ); | |||
| return -1; | |||
| @@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, | |||
| if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) { | |||
| return -6; | |||
| } | |||
| if( LAPACKE_d_nancheck( m, v, 1 ) ) { | |||
| lv = (LAPACKE_lsame( side, 'l' ) ? m : n); | |||
| if( LAPACKE_d_nancheck( lv, v, 1 ) ) { | |||
| return -5; | |||
| } | |||
| } | |||
| @@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ | |||
| if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_d_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { | |||
| @@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { | |||
| return -8; | |||
| } | |||
| } | |||
| @@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -10; | |||
| } | |||
| if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { | |||
| return -9; | |||
| } | |||
| } | |||
| @@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| lapack_int lrv, lcv; /* row, column stride */ | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| lrv = 1; | |||
| lcv = ldv; | |||
| } else { | |||
| lrv = ldv; | |||
| lcv = 1; | |||
| } | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -13; | |||
| } | |||
| @@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct | |||
| if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], | |||
| ldv ) ) | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > nrows_v ) { | |||
| @@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct | |||
| return -8; | |||
| } | |||
| if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, | |||
| &v[(nrows_v-k)*ldv], ldv ) ) | |||
| &v[(nrows_v-k)*lrv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], | |||
| ldv ) ) | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > ncols_v ) { | |||
| LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); | |||
| return -8; | |||
| } | |||
| if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], | |||
| ldv ) ) | |||
| if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, | |||
| &v[(ncols_v-k)*lcv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) | |||
| return -9; | |||
| @@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x, | |||
| if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_s_nancheck( n-1, x, incx ) ) { | |||
| return -3; | |||
| } | |||
| } | |||
| @@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, | |||
| lapack_int n, const float* v, float tau, float* c, | |||
| lapack_int ldc, float* work ) | |||
| { | |||
| lapack_int lv; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_slarfx", -1 ); | |||
| return -1; | |||
| @@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, | |||
| if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) { | |||
| return -6; | |||
| } | |||
| if( LAPACKE_s_nancheck( m, v, 1 ) ) { | |||
| lv = (LAPACKE_lsame( side, 'l' ) ? m : n); | |||
| if( LAPACKE_s_nancheck( lv, v, 1 ) ) { | |||
| return -5; | |||
| } | |||
| } | |||
| @@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ | |||
| if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_s_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { | |||
| @@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { | |||
| return -8; | |||
| } | |||
| } | |||
| @@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -10; | |||
| } | |||
| if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { | |||
| return -9; | |||
| } | |||
| } | |||
| @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x, | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_z_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| } | |||
| @@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && | |||
| LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| lapack_int lrv, lcv; /* row, column stride */ | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| lrv = 1; | |||
| lcv = ldv; | |||
| } else { | |||
| lrv = ldv; | |||
| lcv = 1; | |||
| } | |||
| ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); | |||
| nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : | |||
| ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : | |||
| ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); | |||
| if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -13; | |||
| } | |||
| @@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct | |||
| if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], | |||
| ldv ) ) | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > nrows_v ) { | |||
| @@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct | |||
| return -8; | |||
| } | |||
| if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, | |||
| &v[(nrows_v-k)*ldv], ldv ) ) | |||
| &v[(nrows_v-k)*lrv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], | |||
| ldv ) ) | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, | |||
| &v[k*lrv], ldv ) ) | |||
| return -9; | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { | |||
| } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { | |||
| if( k > ncols_v ) { | |||
| LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); | |||
| return -8; | |||
| } | |||
| if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], | |||
| ldv ) ) | |||
| if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, | |||
| &v[(ncols_v-k)*lcv], ldv ) ) | |||
| return -9; | |||
| if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) | |||
| return -9; | |||
| @@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha, | |||
| if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_z_nancheck( n-1, x, incx ) ) { | |||
| return -3; | |||
| } | |||
| } | |||
| @@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, | |||
| lapack_complex_double tau, lapack_complex_double* c, | |||
| lapack_int ldc, lapack_complex_double* work ) | |||
| { | |||
| lapack_int lv; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_zlarfx", -1 ); | |||
| return -1; | |||
| @@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, | |||
| if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) { | |||
| return -6; | |||
| } | |||
| if( LAPACKE_z_nancheck( m, v, 1 ) ) { | |||
| lv = (LAPACKE_lsame( side, 'l' ) ? m : n); | |||
| if( LAPACKE_z_nancheck( lv, v, 1 ) ) { | |||
| return -5; | |||
| } | |||
| } | |||
| @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x, | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ | |||
| if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { | |||
| if( LAPACKE_z_nancheck( n, x, incx ) ) { | |||
| return -2; | |||
| } | |||
| if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { | |||
| @@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -10; | |||
| } | |||
| if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { | |||
| return -9; | |||
| } | |||
| } | |||
| @@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans, | |||
| if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { | |||
| if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { | |||
| return -8; | |||
| } | |||
| } | |||
| @@ -512,7 +512,7 @@ C END IF | |||
| * | |||
| * Call the kernel | |||
| * | |||
| #if defined(_OPENMP) && _OPENMP >= 201307L | |||
| #if defined(_OPENMP) && _OPENMP >= 201307 | |||
| IF( TTYPE.NE.1 ) THEN | |||
| !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) | |||
| !$OMP$ DEPEND(in:WORK(MYID-1)) | |||
| @@ -481,7 +481,7 @@ | |||
| * | |||
| * Call the kernel | |||
| * | |||
| #if defined(_OPENMP) && _OPENMP >= 201307L | |||
| #if defined(_OPENMP) && _OPENMP >= 201307 | |||
| IF( TTYPE.NE.1 ) THEN | |||
| !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) | |||
| !$OMP$ DEPEND(in:WORK(MYID-1)) | |||
| @@ -512,7 +512,7 @@ C END IF | |||
| * | |||
| * Call the kernel | |||
| * | |||
| #if defined(_OPENMP) && _OPENMP >= 201307L | |||
| #if defined(_OPENMP) && _OPENMP >= 201307 | |||
| IF( TTYPE.NE.1 ) THEN | |||
| !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) | |||
| @@ -67,6 +67,26 @@ double sqrt(double); | |||
| #undef GETRF_FACTOR | |||
| #define GETRF_FACTOR 1.00 | |||
| #if defined(USE_PTHREAD_LOCK) | |||
| static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #elif defined(USE_PTHREAD_SPINLOCK) | |||
| static pthread_spinlock_t getrf_lock = 0; | |||
| #else | |||
| static BLASULONG getrf_lock = 0UL; | |||
| #endif | |||
| #if defined(USE_PTHREAD_LOCK) | |||
| static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #elif defined(USE_PTHREAD_SPINLOCK) | |||
| static pthread_spinlock_t getrf_flag_lock = 0; | |||
| #else | |||
| static BLASULONG getrf_flag_lock = 0UL; | |||
| #endif | |||
| static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { | |||
| double m = (double)(M - IS - BK); | |||
| @@ -99,7 +119,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra | |||
| FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; | |||
| FLOAT *sbb = sb; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; | |||
| #else | |||
| volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; | |||
| #endif | |||
| blasint *ipiv = (blasint *)args -> c; | |||
| @@ -177,7 +201,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra | |||
| /* Non blocking implementation */ | |||
| typedef struct { | |||
| volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| @@ -216,9 +245,12 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| FLOAT *sbb= sb; | |||
| blasint *ipiv = (blasint *)args -> c; | |||
| BLASLONG jw; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; | |||
| #else | |||
| volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; | |||
| #endif | |||
| if (args -> a == NULL) { | |||
| TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); | |||
| sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
| @@ -245,8 +277,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| #if 1 | |||
| { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| do { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } while (jw); | |||
| } | |||
| #else | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; | |||
| #endif | |||
| for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | |||
| min_jj = MIN(n_to, xxx + div_n) - jjs; | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| @@ -283,18 +327,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| b + (is + jjs * lda) * COMPSIZE, lda, is); | |||
| } | |||
| } | |||
| MB; | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } | |||
| } | |||
| LOCK_COMMAND(&getrf_flag_lock); | |||
| flag[mypos * CACHE_LINE_SIZE] = 0; | |||
| UNLOCK_COMMAND(&getrf_flag_lock); | |||
| if (m == 0) { | |||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } | |||
| } | |||
| @@ -318,7 +367,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||
| if ((current != mypos) && (!is)) { | |||
| #if 1 | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| do { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } while (jw == 0); | |||
| #else | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; | |||
| #endif | |||
| } | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, | |||
| @@ -327,7 +387,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| MB; | |||
| if (is + min_i >= m) { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } | |||
| } | |||
| @@ -339,7 +401,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||
| #if 1 | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| do { | |||
| LOCK_COMMAND(&getrf_lock); | |||
| jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; | |||
| UNLOCK_COMMAND(&getrf_lock); | |||
| } while(jw != 0); | |||
| #else | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; | |||
| #endif | |||
| } | |||
| } | |||
| @@ -374,11 +447,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| BLASLONG i, j, k, is, bk; | |||
| BLASLONG num_cpu; | |||
| BLASLONG f; | |||
| #ifdef _MSC_VER | |||
| BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; | |||
| #else | |||
| volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); | |||
| #endif | |||
| #ifndef COMPLEX | |||
| @@ -501,11 +580,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| if (mm >= nn) { | |||
| width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); | |||
| if (width == 0) width = nn; | |||
| if (nn < width) width = nn; | |||
| nn -= width; | |||
| range_N[num_cpu + 1] = range_N[num_cpu] + width; | |||
| width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); | |||
| if (width == 0) width = mm; | |||
| if (mm < width) width = mm; | |||
| if (nn <= 0) width = mm; | |||
| mm -= width; | |||
| @@ -514,11 +595,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| } else { | |||
| width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); | |||
| if (width == 0) width = mm; | |||
| if (mm < width) width = mm; | |||
| mm -= width; | |||
| range_M[num_cpu + 1] = range_M[num_cpu] + width; | |||
| width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); | |||
| if (width == 0) width = nn; | |||
| if (nn < width) width = nn; | |||
| if (mm <= 0) width = nn; | |||
| nn -= width; | |||
| @@ -561,7 +644,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| range_n_new[1] = offset + is + bk; | |||
| if (num_cpu > 0) { | |||
| queue[num_cpu - 1].next = NULL; | |||
| exec_blas_async(0, &queue[0]); | |||
| @@ -572,8 +654,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| if (iinfo && !info) info = iinfo + is; | |||
| for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; | |||
| for (i = 0; i < num_cpu; i ++) { | |||
| #if 1 | |||
| LOCK_COMMAND(&getrf_flag_lock); | |||
| f=flag[i*CACHE_LINE_SIZE]; | |||
| UNLOCK_COMMAND(&getrf_flag_lock); | |||
| while (f!=0) { | |||
| LOCK_COMMAND(&getrf_flag_lock); | |||
| f=flag[i*CACHE_LINE_SIZE]; | |||
| UNLOCK_COMMAND(&getrf_flag_lock); | |||
| }; | |||
| #else | |||
| while (flag[i*CACHE_LINE_SIZE]) {}; | |||
| #endif | |||
| } | |||
| TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); | |||
| } else { | |||
| @@ -634,8 +728,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| BLASLONG range[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, nn, num_cpu; | |||
| volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| @@ -0,0 +1,664 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| //The array of job_t may overflow the stack. | |||
| //Instead, use malloc to alloc job_t. | |||
| #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD | |||
| #define USE_ALLOC_HEAP | |||
| #endif | |||
| static FLOAT dm1 = -1.; | |||
| #ifndef KERNEL_FUNC | |||
| #ifndef LOWER | |||
| #define KERNEL_FUNC SYRK_KERNEL_U | |||
| #else | |||
| #define KERNEL_FUNC SYRK_KERNEL_L | |||
| #endif | |||
| #endif | |||
| #ifndef LOWER | |||
| #ifndef COMPLEX | |||
| #define TRSM_KERNEL TRSM_KERNEL_LT | |||
| #else | |||
| #define TRSM_KERNEL TRSM_KERNEL_LC | |||
| #endif | |||
| #else | |||
| #ifndef COMPLEX | |||
| #define TRSM_KERNEL TRSM_KERNEL_RN | |||
| #else | |||
| #define TRSM_KERNEL TRSM_KERNEL_RR | |||
| #endif | |||
| #endif | |||
| #ifndef CACHE_LINE_SIZE | |||
| #define CACHE_LINE_SIZE 8 | |||
| #endif | |||
| #ifndef DIVIDE_RATE | |||
| #define DIVIDE_RATE 2 | |||
| #endif | |||
| #ifndef SWITCH_RATIO | |||
| #define SWITCH_RATIO 2 | |||
| #endif | |||
| #ifndef LOWER | |||
| #define TRANS | |||
| #endif | |||
| #ifndef SYRK_LOCAL | |||
| #if !defined(LOWER) && !defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_UN | |||
| #elif !defined(LOWER) && defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_UT | |||
| #elif defined(LOWER) && !defined(TRANS) | |||
| #define SYRK_LOCAL SYRK_LN | |||
| #else | |||
| #define SYRK_LOCAL SYRK_LT | |||
| #endif | |||
| #endif | |||
| typedef struct { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| #endif | |||
| BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; | |||
| } job_t; | |||
| #ifndef KERNEL_OPERATION | |||
| #ifndef COMPLEX | |||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||
| KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||
| #else | |||
| #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ | |||
| KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) | |||
| #endif | |||
| #endif | |||
| #ifndef ICOPY_OPERATION | |||
| #ifndef TRANS | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef OCOPY_OPERATION | |||
| #ifdef TRANS | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef S | |||
| #define S args -> a | |||
| #endif | |||
| #ifndef A | |||
| #define A args -> b | |||
| #endif | |||
| #ifndef C | |||
| #define C args -> c | |||
| #endif | |||
| #ifndef LDA | |||
| #define LDA args -> lda | |||
| #endif | |||
| #ifndef N | |||
| #define N args -> m | |||
| #endif | |||
| #ifndef K | |||
| #define K args -> k | |||
| #endif | |||
| static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
| FLOAT *buffer[DIVIDE_RATE]; | |||
| BLASLONG k, lda; | |||
| BLASLONG m_from, m_to; | |||
| FLOAT *alpha; | |||
| FLOAT *a, *c; | |||
| job_t *job = (job_t *)args -> common; | |||
| BLASLONG xxx, bufferside; | |||
| BLASLONG jjs, min_jj; | |||
| BLASLONG is, min_i, div_n; | |||
| BLASLONG i, current; | |||
| k = K; | |||
| a = (FLOAT *)A; | |||
| c = (FLOAT *)C; | |||
| lda = LDA; | |||
| alpha = (FLOAT *)args -> alpha; | |||
| m_from = range_n[mypos + 0]; | |||
| m_to = range_n[mypos + 1]; | |||
| #if 0 | |||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); | |||
| #endif | |||
| div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
| for (i = 1; i < DIVIDE_RATE; i++) { | |||
| buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; | |||
| } | |||
| #ifndef LOWER | |||
| TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); | |||
| #else | |||
| TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); | |||
| #endif | |||
| for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { | |||
| for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ | |||
| min_jj = MIN(m_to, xxx + div_n) - jjs; | |||
| #ifndef LOWER | |||
| if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; | |||
| #else | |||
| if (min_jj > GEMM_P) min_jj = GEMM_P; | |||
| #endif | |||
| #ifndef LOWER | |||
| OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||
| TRSM_KERNEL (k, min_jj, k, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sb, | |||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||
| a + jjs * lda * COMPSIZE, lda, 0); | |||
| #else | |||
| ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); | |||
| TRSM_KERNEL (min_jj, k, k, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, | |||
| sb, | |||
| a + jjs * COMPSIZE, lda, 0); | |||
| #endif | |||
| } | |||
| #ifndef LOWER | |||
| for (i = 0; i <= mypos; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| #else | |||
| for (i = mypos; i < args -> nthreads; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| #endif | |||
| WMB; | |||
| } | |||
| min_i = m_to - m_from; | |||
| if (min_i >= GEMM_P * 2) { | |||
| min_i = GEMM_P; | |||
| } else | |||
| if (min_i > GEMM_P) { | |||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| } | |||
| #ifndef LOWER | |||
| ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||
| #else | |||
| OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); | |||
| #endif | |||
| current = mypos; | |||
| #ifndef LOWER | |||
| while (current < args -> nthreads) | |||
| #else | |||
| while (current >= 0) | |||
| #endif | |||
| { | |||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||
| /* thread has to wait */ | |||
| if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, lda, m_from, xxx); | |||
| if (m_from + min_i >= m_to) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| #ifndef LOWER | |||
| current ++; | |||
| #else | |||
| current --; | |||
| #endif | |||
| } | |||
| for(is = m_from + min_i; is < m_to; is += min_i){ | |||
| min_i = m_to - is; | |||
| if (min_i >= GEMM_P * 2) { | |||
| min_i = GEMM_P; | |||
| } else | |||
| if (min_i > GEMM_P) { | |||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| } | |||
| #ifndef LOWER | |||
| ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||
| #else | |||
| OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); | |||
| #endif | |||
| current = mypos; | |||
| #ifndef LOWER | |||
| while (current < args -> nthreads) | |||
| #else | |||
| while (current >= 0) | |||
| #endif | |||
| { | |||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, lda, is, xxx); | |||
| if (is + min_i >= m_to) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| #ifndef LOWER | |||
| current ++; | |||
| #else | |||
| current --; | |||
| #endif | |||
| } | |||
| } | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| if (i != mypos) { | |||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ | |||
| blas_arg_t newarg; | |||
| #ifndef USE_ALLOC_HEAP | |||
| job_t job[MAX_CPU_NUMBER]; | |||
| #else | |||
| job_t * job = NULL; | |||
| #endif | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range[MAX_CPU_NUMBER + 100]; | |||
| BLASLONG num_cpu; | |||
| BLASLONG nthreads = args -> nthreads; | |||
| BLASLONG width, i, j, k; | |||
| BLASLONG n, n_from, n_to; | |||
| int mode, mask; | |||
| double dnum; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; | |||
| #endif | |||
| #endif | |||
| newarg.m = args -> m; | |||
| newarg.k = args -> k; | |||
| newarg.a = args -> a; | |||
| newarg.b = args -> b; | |||
| newarg.c = args -> c; | |||
| newarg.lda = args -> lda; | |||
| newarg.alpha = args -> alpha; | |||
| #ifdef USE_ALLOC_HEAP | |||
| job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); | |||
| if(job==NULL){ | |||
| fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); | |||
| exit(1); | |||
| } | |||
| #endif | |||
| newarg.common = (void *)job; | |||
| n_from = 0; | |||
| n_to = args -> m; | |||
| #ifndef LOWER | |||
| range[MAX_CPU_NUMBER] = n_to - n_from; | |||
| range[0] = 0; | |||
| num_cpu = 0; | |||
| i = 0; | |||
| n = n_to - n_from; | |||
| dnum = (double)n * (double)n /(double)nthreads; | |||
| while (i < n){ | |||
| if (nthreads - num_cpu > 1) { | |||
| double di = (double)i; | |||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||
| if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); | |||
| if ((width > n - i) || (width < mask)) width = n - i; | |||
| } else { | |||
| width = n - i; | |||
| } | |||
| range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = inner_thread; | |||
| queue[num_cpu].args = &newarg; | |||
| queue[num_cpu].range_m = NULL; | |||
| queue[num_cpu].sa = NULL; | |||
| queue[num_cpu].sb = NULL; | |||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||
| num_cpu ++; | |||
| i += width; | |||
| } | |||
| for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; | |||
| #else | |||
| range[0] = 0; | |||
| num_cpu = 0; | |||
| i = 0; | |||
| n = n_to - n_from; | |||
| dnum = (double)n * (double)n /(double)nthreads; | |||
| while (i < n){ | |||
| if (nthreads - num_cpu > 1) { | |||
| double di = (double)i; | |||
| width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||
| if ((width > n - i) || (width < mask)) width = n - i; | |||
| } else { | |||
| width = n - i; | |||
| } | |||
| range[num_cpu + 1] = range[num_cpu] + width; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = inner_thread; | |||
| queue[num_cpu].args = &newarg; | |||
| queue[num_cpu].range_m = NULL; | |||
| queue[num_cpu].range_n = range; | |||
| queue[num_cpu].sa = NULL; | |||
| queue[num_cpu].sb = NULL; | |||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||
| num_cpu ++; | |||
| i += width; | |||
| } | |||
| #endif | |||
| newarg.nthreads = num_cpu; | |||
| if (num_cpu) { | |||
| for (j = 0; j < num_cpu; j++) { | |||
| for (i = 0; i < num_cpu; i++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| job[j].working[i][CACHE_LINE_SIZE * k] = 0; | |||
| } | |||
| } | |||
| } | |||
| queue[0].sa = sa; | |||
| queue[0].sb = sb; | |||
| queue[num_cpu - 1].next = NULL; | |||
| exec_blas(num_cpu, queue); | |||
| } | |||
| #ifdef USE_ALLOC_HEAP | |||
| free(job); | |||
| #endif | |||
| return 0; | |||
| } | |||
| #endif | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
| BLASLONG n, bk, i, blocking, lda; | |||
| BLASLONG info; | |||
| int mode; | |||
| blas_arg_t newarg; | |||
| FLOAT *a; | |||
| FLOAT alpha[2] = { -ONE, ZERO}; | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| if (args -> nthreads == 1) { | |||
| #ifndef LOWER | |||
| info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); | |||
| #endif | |||
| return info; | |||
| } | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) n = range_n[1] - range_n[0]; | |||
| if (n <= GEMM_UNROLL_N * 2) { | |||
| #ifndef LOWER | |||
| info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); | |||
| #else | |||
| info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); | |||
| #endif | |||
| return info; | |||
| } | |||
| newarg.lda = lda; | |||
| newarg.ldb = lda; | |||
| newarg.ldc = lda; | |||
| newarg.alpha = alpha; | |||
| newarg.beta = NULL; | |||
| newarg.nthreads = args -> nthreads; | |||
| blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; | |||
| if (blocking > GEMM_Q) blocking = GEMM_Q; | |||
| for (i = 0; i < n; i += blocking) { | |||
| bk = n - i; | |||
| if (bk > blocking) bk = blocking; | |||
| newarg.m = bk; | |||
| newarg.n = bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||
| info = CNAME(&newarg, NULL, NULL, sa, sb, 0); | |||
| if (info) return info + i; | |||
| if (n - i - bk > 0) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| newarg.m = n - i - bk; | |||
| newarg.k = bk; | |||
| #ifndef LOWER | |||
| newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; | |||
| #else | |||
| newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; | |||
| #endif | |||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; | |||
| thread_driver(&newarg, sa, sb); | |||
| #else | |||
| #ifndef LOWER | |||
| newarg.m = bk; | |||
| newarg.n = n - i - bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||
| newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; | |||
| gemm_thread_n(mode | BLAS_TRANSA_T, | |||
| &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); | |||
| newarg.n = n - i - bk; | |||
| newarg.k = bk; | |||
| newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; | |||
| newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; | |||
| #if 0 | |||
| HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, | |||
| &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); | |||
| #endif | |||
| #else | |||
| newarg.m = n - i - bk; | |||
| newarg.n = bk; | |||
| newarg.a = a + (i + i * lda) * COMPSIZE; | |||
| newarg.b = a + (i + bk + i * lda) * COMPSIZE; | |||
| gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, | |||
| &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); | |||
| newarg.n = n - i - bk; | |||
| newarg.k = bk; | |||
| newarg.a = a + (i + bk + i * lda) * COMPSIZE; | |||
| newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; | |||
| #if 0 | |||
| HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); | |||
| #else | |||
| syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, | |||
| &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); | |||
| #endif | |||
| #endif | |||
| #endif | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500) | |||
| #if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| @@ -25,6 +25,7 @@ endif () | |||
| # known to hang with the native Windows and Android threads | |||
| # FIXME needs checking if this works on any of the other platforms | |||
| if (NOT NO_CBLAS) | |||
| if (NOT USE_OPENMP) | |||
| if (OS_CYGWIN_NT OR OS_LINUX) | |||
| set(OpenBLAS_utest_src | |||
| @@ -33,6 +34,7 @@ set(OpenBLAS_utest_src | |||
| ) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (NOT NO_LAPACK) | |||
| set(OpenBLAS_utest_src | |||
| @@ -17,11 +17,13 @@ endif | |||
| #this does not work with OpenMP nor with native Windows or Android threads | |||
| # FIXME TBD if this works on OSX, SunOS, POWER and zarch | |||
| ifneq ($(NO_CBLAS), 1) | |||
| ifndef USE_OPENMP | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) | |||
| OBJS += test_fork.o | |||
| endif | |||
| endif | |||
| endif | |||
| all : run_test | |||
| @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "openblas_utest.h" | |||
| #include <sys/types.h> | |||
| #include <sys/wait.h> | |||
| #include <cblas.h> | |||