Update branch from develop for 0.3.14 release (tags/v0.3.14)
| @@ -190,3 +190,27 @@ steps: | |||||
| - make -C ctest $COMMON_FLAGS | - make -C ctest $COMMON_FLAGS | ||||
| - make -C utest $COMMON_FLAGS | - make -C utest $COMMON_FLAGS | ||||
| - make -C cpp_thread_test dgemm_tester | - make -C cpp_thread_test dgemm_tester | ||||
| --- | |||||
| kind: pipeline | |||||
| name: arm64_gcc10 | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm64 | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:20.04 | |||||
| environment: | |||||
| CC: gcc-10 | |||||
| FC: gfortran-10 | |||||
| COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' | |||||
| commands: | |||||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC gfortran-10 perl python g++ | |||||
| - $CC --version | |||||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||||
| - make -C utest $COMMON_FLAGS | |||||
| - make -C test $COMMON_FLAGS | |||||
| @@ -44,6 +44,11 @@ jobs: | |||||
| if: github.event_name != 'pull_request' | if: github.event_name != 'pull_request' | ||||
| run: brew update || true | run: brew update || true | ||||
| - name: unlink installed gcc to allow updating | |||||
| run: | | |||||
| brew unlink gcc@8 | |||||
| brew unlink gcc@9 | |||||
| - name: Install prerequisites | - name: Install prerequisites | ||||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | ||||
| @@ -89,5 +89,7 @@ build.* | |||||
| *.swp | *.swp | ||||
| benchmark/*.goto | benchmark/*.goto | ||||
| benchmark/smallscaling | benchmark/smallscaling | ||||
| .vscode | |||||
| CMakeCache.txt | CMakeCache.txt | ||||
| CMakeFiles/* | CMakeFiles/* | ||||
| .vscode | |||||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
| project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 13) | |||||
| set(OpenBLAS_PATCH_VERSION 14) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| # Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
| @@ -14,6 +14,9 @@ include(GNUInstallDirs) | |||||
| include(CMakePackageConfigHelpers) | include(CMakePackageConfigHelpers) | ||||
| if(MSVC AND NOT DEFINED NOFORTRAN) | |||||
| set(NOFORTRAN ON) | |||||
| endif() | |||||
| ####### | ####### | ||||
| if(MSVC) | if(MSVC) | ||||
| @@ -229,7 +232,7 @@ if (NOT NO_CBLAS) | |||||
| add_subdirectory(utest) | add_subdirectory(utest) | ||||
| endif() | endif() | ||||
| if (NOT MSVC AND NOT NOFORTRAN) | |||||
| if (NOT NOFORTRAN) | |||||
| # Build test and ctest | # Build test and ctest | ||||
| add_subdirectory(test) | add_subdirectory(test) | ||||
| if(NOT NO_CBLAS) | if(NOT NO_CBLAS) | ||||
| @@ -1,4 +1,52 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.14 | |||||
| 17-Mar-2021 | |||||
| common: | |||||
| * Fixed a race condition on thread shutdown in non-OpenMP builds | |||||
| * Fixed custom BUFFERSIZE option getting ignored in gmake builds | |||||
| * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms | |||||
| * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT | |||||
| * Improved performance of OMATCOPY_RT across all platforms | |||||
| * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl | |||||
| * Fixed potential misreading of the GCC compiler version in the build scripts | |||||
| * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) | |||||
| * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) | |||||
| RISCV: | |||||
| * Fixed compilation on RISCV (missing entry in getarch) | |||||
| POWER: | |||||
| * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions | |||||
| * Added support for compilation on FreeBSD/ppc64le | |||||
| * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL | |||||
| * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM | |||||
| * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 | |||||
| * Improved SCOPY and CCOPY performance on POWER10 | |||||
| * Improved SGEMM and DGEMM performance on POWER10 | |||||
| * Added support for compilation with the NVIDIA HPC compiler | |||||
| x86_64: | |||||
| * Added an optimized bfloat16 GEMM kernel for Cooperlake | |||||
| * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus | |||||
| * Improved the performance of SASUM, DASUM, SROT, DROT on AMD Ryzen cpus | |||||
| * Added support for compilation with the NAG Fortran compiler | |||||
| * Fixed recognition of the AMD AOCC compiler | |||||
| * Fixed compilation for DYNAMIC_ARCH with clang on Windows | |||||
| * Added support for running the BLAS/CBLAS tests on Windows | |||||
| * Fixed signatures of the tls callback functions for Windows x64 | |||||
| * Fixed various issues with fma intrinsics support handling | |||||
| ARM: | |||||
| * Added support for embedded Cortex M targets via a new option EMBEDDED | |||||
| ARMV8: | |||||
| * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf | |||||
| * Added support for the DYNAMIC_LIST option | |||||
| * Added support for compilation with the NVIDIA HPC compiler | |||||
| * Added support for compiling with the NAG Fortran compiler | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.3.13 | Version 0.3.13 | ||||
| 12-Dec-2020 | 12-Dec-2020 | ||||
| @@ -59,6 +59,9 @@ endif | |||||
| @$(CC) --version > /dev/null 2>&1;\ | @$(CC) --version > /dev/null 2>&1;\ | ||||
| if [ $$? -eq 0 ]; then \ | if [ $$? -eq 0 ]; then \ | ||||
| cverinfo=`$(CC) --version | sed -n '1p'`; \ | cverinfo=`$(CC) --version | sed -n '1p'`; \ | ||||
| if [ -z "$${cverinfo}" ]; then \ | |||||
| cverinfo=`$(CC) --version | sed -n '2p'`; \ | |||||
| fi; \ | |||||
| echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ | echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ | ||||
| else \ | else \ | ||||
| echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ | echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ | ||||
| @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| @$(FC) --version > /dev/null 2>&1;\ | @$(FC) --version > /dev/null 2>&1;\ | ||||
| if [ $$? -eq 0 ]; then \ | if [ $$? -eq 0 ]; then \ | ||||
| fverinfo=`$(FC) --version | sed -n '1p'`; \ | fverinfo=`$(FC) --version | sed -n '1p'`; \ | ||||
| if [ -z "$${fverinfo}" ]; then \ | |||||
| fverinfo=`$(FC) --version | sed -n '2p'`; \ | |||||
| fi; \ | |||||
| echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ | echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ | ||||
| else \ | else \ | ||||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | ||||
| @@ -1,28 +1,38 @@ | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| ifeq ($(CORE), ARMV8) | ifeq ($(CORE), ARMV8) | ||||
| CCOMMON_OPT += -march=armv8-a | CCOMMON_OPT += -march=armv8-a | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a | FCOMMON_OPT += -march=armv8-a | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), CORTEXA53) | ifeq ($(CORE), CORTEXA53) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), CORTEXA57) | ifeq ($(CORE), CORTEXA57) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), CORTEXA72) | ifeq ($(CORE), CORTEXA72) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), CORTEXA73) | ifeq ($(CORE), CORTEXA73) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | ||||
| endif | endif | ||||
| endif | |||||
| # Use a72 tunings because Neoverse-N1 is only available | # Use a72 tunings because Neoverse-N1 is only available | ||||
| # in GCC>=9 | # in GCC>=9 | ||||
| @@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1) | |||||
| ifeq ($(GCCVERSIONGTEQ7), 1) | ifeq ($(GCCVERSIONGTEQ7), 1) | ||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | ifeq ($(GCCVERSIONGTEQ9), 1) | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | ||||
| endif | |||||
| else | else | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| endif | endif | ||||
| endif | |||||
| else | else | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), THUNDERX) | ifeq ($(CORE), THUNDERX) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=thunderx | CCOMMON_OPT += -march=armv8-a -mtune=thunderx | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=thunderx | FCOMMON_OPT += -march=armv8-a -mtune=thunderx | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), FALKOR) | ifeq ($(CORE), FALKOR) | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=falkor | CCOMMON_OPT += -march=armv8-a -mtune=falkor | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a -mtune=falkor | FCOMMON_OPT += -march=armv8-a -mtune=falkor | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), THUNDERX2T99) | ifeq ($(CORE), THUNDERX2T99) | ||||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), THUNDERX3T110) | ifeq ($(CORE), THUNDERX3T110) | ||||
| ifeq ($(GCCVERSIONGTEQ10), 1) | ifeq ($(GCCVERSIONGTEQ10), 1) | ||||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | ||||
| endif | |||||
| else | else | ||||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), VORTEX) | ifeq ($(CORE), VORTEX) | ||||
| CCOMMON_OPT += -march=armv8.3-a | CCOMMON_OPT += -march=armv8.3-a | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.3-a | FCOMMON_OPT += -march=armv8.3-a | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | ifeq ($(GCCVERSIONGTEQ9), 1) | ||||
| ifeq ($(CORE), TSV110) | ifeq ($(CORE), TSV110) | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| endif | |||||
| @@ -10,9 +10,11 @@ USE_OPENMP = 1 | |||||
| endif | endif | ||||
| ifeq ($(CORE), POWER10) | ifeq ($(CORE), POWER10) | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), POWER9) | ifeq ($(CORE), POWER9) | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.13 | |||||
| VERSION = 0.3.14 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) | |||||
| override ARCH=x86_64 | override ARCH=x86_64 | ||||
| else ifeq ($(ARCH), powerpc64) | else ifeq ($(ARCH), powerpc64) | ||||
| override ARCH=power | override ARCH=power | ||||
| else ifeq ($(ARCH), powerpc64le) | |||||
| override ARCH=power | |||||
| else ifeq ($(ARCH), powerpc) | else ifeq ($(ARCH), powerpc) | ||||
| override ARCH=power | override ARCH=power | ||||
| else ifeq ($(ARCH), i386) | else ifeq ($(ARCH), i386) | ||||
| @@ -181,7 +183,7 @@ endif | |||||
| # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. | # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. | ||||
| ifeq ($(HOSTARCH), x86_64) | ifeq ($(HOSTARCH), x86_64) | ||||
| ifeq ($(findstring pgcc,$(HOSTCC)),) | |||||
| ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) | |||||
| GETARCH_FLAGS += -march=native | GETARCH_FLAGS += -march=native | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -623,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99 | |||||
| DYNAMIC_CORE += TSV110 | DYNAMIC_CORE += TSV110 | ||||
| DYNAMIC_CORE += EMAG8180 | DYNAMIC_CORE += EMAG8180 | ||||
| DYNAMIC_CORE += THUNDERX3T110 | DYNAMIC_CORE += THUNDERX3T110 | ||||
| ifdef DYNAMIC_LIST | |||||
| override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) | |||||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 | |||||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| @@ -663,6 +670,7 @@ endif | |||||
| endif # ARCH zarch | endif # ARCH zarch | ||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| DYNAMIC_CORE = POWER6 | DYNAMIC_CORE = POWER6 | ||||
| DYNAMIC_CORE += POWER8 | DYNAMIC_CORE += POWER8 | ||||
| ifneq ($(C_COMPILER), GCC) | ifneq ($(C_COMPILER), GCC) | ||||
| @@ -689,6 +697,10 @@ else | |||||
| $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | ||||
| endif | endif | ||||
| endif | endif | ||||
| else | |||||
| DYNAMIC_CORE = POWER8 | |||||
| DYNAMIC_CORE += POWER9 | |||||
| endif | |||||
| endif | endif | ||||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | ||||
| @@ -847,9 +859,19 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), PGI) | ifeq ($(C_COMPILER), PGI) | ||||
| PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||||
| PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) | |||||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) | |||||
| PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) | |||||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) | |||||
| NEWPGI := 1 | |||||
| endif | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm | |||||
| CCOMMON_OPT += -tp p7-64 | |||||
| ifneq ($(NEWPGI),1) | |||||
| CCOMMON_OPT += -D__MMX__ -Mnollvm | |||||
| endif | |||||
| else | else | ||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| @@ -877,13 +899,25 @@ endif | |||||
| # Fortran Compiler dependent settings | # Fortran Compiler dependent settings | ||||
| # | # | ||||
| ifeq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe | |||||
| ifdef INTERFACE64 | |||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| FCOMMON_OPT += -openmp | |||||
| endif | |||||
| endif | |||||
| ifeq ($(F_COMPILER), FLANG) | ifeq ($(F_COMPILER), FLANG) | ||||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | CCOMMON_OPT += -DF_INTERFACE_FLANG | ||||
| FCOMMON_OPT += -Mrecursive -Kieee | FCOMMON_OPT += -Mrecursive -Kieee | ||||
| ifeq ($(OSNAME), Linux) | ifeq ($(OSNAME), Linux) | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) | |||||
| ifeq ($(FLANG_VENDOR),AOCC) | |||||
| FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") | |||||
| ifeq ($(FLANG_VENDOR), AMD) | |||||
| FCOMMON_OPT += -fno-unroll-loops | FCOMMON_OPT += -fno-unroll-loops | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -1029,18 +1063,24 @@ ifeq ($(ARCH), x86_64) | |||||
| FCOMMON_OPT += -tp p7-64 | FCOMMON_OPT += -tp p7-64 | ||||
| else | else | ||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| ifeq ($(CORE), POWER6) | |||||
| $(warning NVIDIA HPC compilers do not support POWER6.) | |||||
| endif | |||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| FCOMMON_OPT += -tp pwr8 | FCOMMON_OPT += -tp pwr8 | ||||
| endif | endif | ||||
| ifeq ($(CORE), POWER9) | ifeq ($(CORE), POWER9) | ||||
| FCOMMON_OPT += -tp pwr9 | FCOMMON_OPT += -tp pwr9 | ||||
| endif | endif | ||||
| ifeq ($(CORE), POWER10) | |||||
| $(warning NVIDIA HPC compilers do not support POWER10.) | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| else | else | ||||
| FCOMMON_OPT += -tp p7 | FCOMMON_OPT += -tp p7 | ||||
| endif | endif | ||||
| FCOMMON_OPT += -Mrecursive | |||||
| FCOMMON_OPT += -Mrecursive -Kieee | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| FCOMMON_OPT += -mp | FCOMMON_OPT += -mp | ||||
| endif | endif | ||||
| @@ -1179,6 +1219,8 @@ CCOMMON_OPT += -fPIC | |||||
| endif | endif | ||||
| ifeq ($(F_COMPILER), SUN) | ifeq ($(F_COMPILER), SUN) | ||||
| FCOMMON_OPT += -pic | FCOMMON_OPT += -pic | ||||
| else ifeq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -PIC | |||||
| else | else | ||||
| FCOMMON_OPT += -fPIC | FCOMMON_OPT += -fPIC | ||||
| endif | endif | ||||
| @@ -1256,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI | |||||
| EXTRALIB += -lpapi -lperfctr | EXTRALIB += -lpapi -lperfctr | ||||
| endif | endif | ||||
| ifdef BUFFERSIZE | |||||
| CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) | |||||
| endif | |||||
| ifdef DYNAMIC_THREADS | ifdef DYNAMIC_THREADS | ||||
| CCOMMON_OPT += -DDYNAMIC_THREADS | CCOMMON_OPT += -DDYNAMIC_THREADS | ||||
| endif | endif | ||||
| @@ -1433,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS) | |||||
| LAPACK_FPFLAGS := $(FPFLAGS) | LAPACK_FPFLAGS := $(FPFLAGS) | ||||
| endif | endif | ||||
| ifeq ($(F_COMPILER),NAG) | |||||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||||
| endif | |||||
| LAPACK_CFLAGS = $(CFLAGS) | LAPACK_CFLAGS = $(CFLAGS) | ||||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| @@ -10,40 +10,46 @@ endif | |||||
| ifdef HAVE_SSE3 | ifdef HAVE_SSE3 | ||||
| CCOMMON_OPT += -msse3 | CCOMMON_OPT += -msse3 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -msse3 | FCOMMON_OPT += -msse3 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef HAVE_SSSE3 | ifdef HAVE_SSSE3 | ||||
| CCOMMON_OPT += -mssse3 | CCOMMON_OPT += -mssse3 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -mssse3 | FCOMMON_OPT += -mssse3 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef HAVE_SSE4_1 | ifdef HAVE_SSE4_1 | ||||
| CCOMMON_OPT += -msse4.1 | CCOMMON_OPT += -msse4.1 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -msse4.1 | FCOMMON_OPT += -msse4.1 | ||||
| endif | endif | ||||
| endif | |||||
| ifndef OLDGCC | ifndef OLDGCC | ||||
| ifdef HAVE_AVX | ifdef HAVE_AVX | ||||
| CCOMMON_OPT += -mavx | CCOMMON_OPT += -mavx | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -mavx | FCOMMON_OPT += -mavx | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifndef NO_AVX2 | ifndef NO_AVX2 | ||||
| ifdef HAVE_AVX2 | ifdef HAVE_AVX2 | ||||
| CCOMMON_OPT += -mavx2 | CCOMMON_OPT += -mavx2 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -mavx2 | FCOMMON_OPT += -mavx2 | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifndef OLDGCC | |||||
| ifdef HAVE_FMA3 | |||||
| CCOMMON_OPT += -mfma | |||||
| FCOMMON_OPT += -mfma | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(CORE), SKYLAKEX) | ifeq ($(CORE), SKYLAKEX) | ||||
| ifndef DYNAMIC_ARCH | ifndef DYNAMIC_ARCH | ||||
| ifndef NO_AVX512 | ifndef NO_AVX512 | ||||
| CCOMMON_OPT += -march=skylake-avx512 | CCOMMON_OPT += -march=skylake-avx512 | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=skylake-avx512 | FCOMMON_OPT += -march=skylake-avx512 | ||||
| endif | |||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | CCOMMON_OPT += -fno-asynchronous-unwind-tables | ||||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | FCOMMON_OPT += -fno-asynchronous-unwind-tables | ||||
| @@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC) | |||||
| # cooperlake support was added in 10.1 | # cooperlake support was added in 10.1 | ||||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | ||||
| CCOMMON_OPT += -march=cooperlake | CCOMMON_OPT += -march=cooperlake | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=cooperlake | FCOMMON_OPT += -march=cooperlake | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | CCOMMON_OPT += -fno-asynchronous-unwind-tables | ||||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | FCOMMON_OPT += -fno-asynchronous-unwind-tables | ||||
| @@ -13,10 +13,14 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version. | |||||
| Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. | Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. | ||||
| For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: | |||||
| <https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six | |||||
| 20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or YouTube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful. | |||||
| ## Binary Packages | ## Binary Packages | ||||
| We provide official binary packages for the following platform: | We provide official binary packages for the following platform: | ||||
| @@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur | |||||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | ||||
| - **AIX**: Supported on PPC up to POWER8 | - **AIX**: Supported on PPC up to POWER8 | ||||
| - **Haiku**: Supported by the community. We don't actively test the library on this OS. | - **Haiku**: Supported by the community. We don't actively test the library on this OS. | ||||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS: | |||||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS. | |||||
| - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. | |||||
| ## Usage | ## Usage | ||||
| @@ -30,10 +30,10 @@ environment: | |||||
| CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | ||||
| matrix: | matrix: | ||||
| - COMPILER: clang-cl | - COMPILER: clang-cl | ||||
| WITH_FORTRAN: yes | |||||
| WITH_FORTRAN: ON | |||||
| - COMPILER: clang-cl | - COMPILER: clang-cl | ||||
| DYNAMIC_ARCH: ON | DYNAMIC_ARCH: ON | ||||
| WITH_FORTRAN: no | |||||
| WITH_FORTRAN: OFF | |||||
| - COMPILER: cl | - COMPILER: cl | ||||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | - COMPILER: MinGW64-gcc-7.2.0-mingw | ||||
| DYNAMIC_ARCH: OFF | DYNAMIC_ARCH: OFF | ||||
| @@ -47,12 +47,7 @@ environment: | |||||
| install: | install: | ||||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | ||||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | ||||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake | |||||
| - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja | |||||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja | |||||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang | |||||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | ||||
| - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | ||||
| - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | ||||
| @@ -68,15 +63,14 @@ before_build: | |||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||||
| - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | ||||
| build_script: | build_script: | ||||
| - cmake --build . | - cmake --build . | ||||
| test_script: | test_script: | ||||
| - echo Running Test | |||||
| - cd utest | |||||
| - openblas_utest | |||||
| - ctest -j2 | |||||
| @@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | ||||
| struct timeval start, stop; | struct timeval start, stop; | ||||
| #elif defined(__APPLE__) | |||||
| mach_timebase_info_data_t info; | |||||
| uint64_t start = 0, stop = 0; | |||||
| #else | #else | ||||
| struct timespec start = { 0, 0 }, stop = { 0, 0 }; | struct timespec start = { 0, 0 }, stop = { 0, 0 }; | ||||
| #endif | #endif | ||||
| @@ -82,6 +85,9 @@ double getsec() | |||||
| { | { | ||||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | ||||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | ||||
| #elif defined(__APPLE__) | |||||
| mach_timebase_info(&info); | |||||
| return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; | |||||
| #else | #else | ||||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; | return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; | ||||
| #endif | #endif | ||||
| @@ -90,6 +96,8 @@ double getsec() | |||||
| void begin() { | void begin() { | ||||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | ||||
| gettimeofday( &start, (struct timezone *)0); | gettimeofday( &start, (struct timezone *)0); | ||||
| #elif defined(__APPLE__) | |||||
| start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||||
| #else | #else | ||||
| clock_gettime(CLOCK_REALTIME, &start); | clock_gettime(CLOCK_REALTIME, &start); | ||||
| #endif | #endif | ||||
| @@ -98,7 +106,9 @@ void begin() { | |||||
| void end() { | void end() { | ||||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | ||||
| gettimeofday( &stop, (struct timezone *)0); | gettimeofday( &stop, (struct timezone *)0); | ||||
| #elif defined(__APPLE__) | |||||
| stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||||
| #else | #else | ||||
| clock_gettime(CLOCK_REALTIME, &stop); | clock_gettime(CLOCK_REALTIME, &stop); | ||||
| #endif | #endif | ||||
| } | |||||
| } | |||||
| @@ -1,11 +1,11 @@ | |||||
| #!/usr/bin/perl | |||||
| #!/usr/bin/env perl | |||||
| #use File::Basename; | #use File::Basename; | ||||
| # use File::Temp qw(tempfile); | # use File::Temp qw(tempfile); | ||||
| # Checking cross compile | # Checking cross compile | ||||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | ||||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||||
| $hostarch = `uname -m | sed -e s/i.86/x86/`; | |||||
| $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | ||||
| chop($hostarch); | chop($hostarch); | ||||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | $hostarch = "x86_64" if ($hostarch eq "amd64"); | ||||
| @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, | |||||
| void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | ||||
| void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | ||||
| void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||||
| void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||||
| void cblas_srotg(float *a, float *b, float *c, float *s); | void cblas_srotg(float *a, float *b, float *c, float *s); | ||||
| void cblas_drotg(double *a, double *b, double *c, double *s); | void cblas_drotg(double *a, double *b, double *c, double *s); | ||||
| void cblas_crotg(void *a, void *b, float *c, void *s); | |||||
| void cblas_zrotg(void *a, void *b, double *c, void *s); | |||||
| void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | ||||
| void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | ||||
| @@ -45,6 +45,9 @@ endif () | |||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (ARM64) | if (ARM64) | ||||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | ||||
| if (DYNAMIC_LIST) | |||||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||||
| endif () | |||||
| endif () | endif () | ||||
| if (POWER) | if (POWER) | ||||
| @@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC}) | |||||
| endforeach () | endforeach () | ||||
| set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") | set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") | ||||
| configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) | |||||
| include_directories(${lapacke_include_dir}) | include_directories(${lapacke_include_dir}) | ||||
| set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | ||||
| @@ -148,16 +148,20 @@ endif () | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | ||||
| if (DEFINED TARGET) | if (DEFINED TARGET) | ||||
| if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | ||||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | ||||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | ||||
| else() | else() | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | ||||
| endif() | endif() | ||||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||||
| # endif() | |||||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||||
| else() | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||||
| endif() | |||||
| endif() | |||||
| endif() | endif() | ||||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | ||||
| @@ -233,6 +237,11 @@ if (BINARY64) | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if(EMBEDDED) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") | |||||
| endif() | |||||
| if (NEED_PIC) | if (NEED_PIC) | ||||
| if (${CMAKE_C_COMPILER} STREQUAL "IBM") | if (${CMAKE_C_COMPILER} STREQUAL "IBM") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | ||||
| @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | ||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | ||||
| if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) | |||||
| set (CMAKE_MATCH_1 CMAKE_C_COMPILER) | |||||
| endif () | |||||
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | ||||
| # message (STATUS "condition is true") | # message (STATUS "condition is true") | ||||
| set (IfElse 1) | set (IfElse 1) | ||||
| @@ -122,7 +122,7 @@ extern "C" { | |||||
| #define ATOM GOTO_ATOM | #define ATOM GOTO_ATOM | ||||
| #undef GOTO_ATOM | #undef GOTO_ATOM | ||||
| #endif | #endif | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||
| #ifndef NO_SYSV_IPC | #ifndef NO_SYSV_IPC | ||||
| #include <sys/shm.h> | #include <sys/shm.h> | ||||
| @@ -134,6 +134,9 @@ extern "C" { | |||||
| #if defined(SMP) || defined(USE_LOCKING) | #if defined(SMP) || defined(USE_LOCKING) | ||||
| #include <pthread.h> | #include <pthread.h> | ||||
| #endif | #endif | ||||
| #else | |||||
| #include <time.h> | |||||
| #include <math.h> | |||||
| #endif | #endif | ||||
| #if defined(OS_SUNOS) | #if defined(OS_SUNOS) | ||||
| @@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){ | |||||
| struct timespec ts; | struct timespec ts; | ||||
| clock_gettime(CLOCK_MONOTONIC, &ts); | clock_gettime(CLOCK_MONOTONIC, &ts); | ||||
| return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| struct timeval tv; | struct timeval tv; | ||||
| gettimeofday(&tv,NULL); | gettimeofday(&tv,NULL); | ||||
| return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | ||||
| #else | |||||
| return 0; | |||||
| #endif | #endif | ||||
| } | } | ||||
| #define RPCC_DEFINED | #define RPCC_DEFINED | ||||
| @@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| #include "common_linux.h" | #include "common_linux.h" | ||||
| #endif | #endif | ||||
| #ifdef OS_EMBEDDED | |||||
| #define DTB_DEFAULT_ENTRIES 64 | |||||
| #endif | |||||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | ||||
| #ifdef __NetBSD__ | #ifdef __NetBSD__ | ||||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define INLINE inline | #define INLINE inline | ||||
| #ifdef F_INTERFACE_FLANG | |||||
| #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) | |||||
| #define RETURN_BY_STACK | #define RETURN_BY_STACK | ||||
| #else | #else | ||||
| #define RETURN_BY_COMPLEX | #define RETURN_BY_COMPLEX | ||||
| @@ -1418,6 +1418,15 @@ int get_cpuname(void){ | |||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| switch (model) { | switch (model) { | ||||
| case 12: // Tiger Lake | |||||
| if(support_avx512()) | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CPUTYPE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 14: // Kaby Lake and refreshes | case 14: // Kaby Lake and refreshes | ||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
| @@ -1436,6 +1445,15 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 7: // Rocket Lake | |||||
| if(support_avx512()) | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CPUTYPE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -2014,6 +2032,19 @@ int get_coretype(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 7:// Rocket Lake | |||||
| #ifndef NO_AVX512 | |||||
| if(support_avx512()) | |||||
| return CORE_SKYLAKEX; | |||||
| #endif | |||||
| #ifndef NO_AVX2 | |||||
| if(support_avx2()) | |||||
| return CORE_HASWELL; | |||||
| #endif | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | } | ||||
| case 5: | case 5: | ||||
| switch (model) { | switch (model) { | ||||
| @@ -2102,6 +2133,16 @@ int get_coretype(void){ | |||||
| break; | break; | ||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| if (model == 12) { // Tiger Lake | |||||
| if(support_avx512()) | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CPUTYPE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| if (model == 14) { // Kaby Lake | if (model == 14) { // Kaby Lake | ||||
| if(support_avx()) | if(support_avx()) | ||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| @@ -5,9 +5,18 @@ enable_language(Fortran) | |||||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | ||||
| if(WIN32) | |||||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 | |||||
| "$ErrorActionPreference = \"Stop\"\n" | |||||
| "Get-Content $args[1] | & $args[0]\n" | |||||
| ) | |||||
| set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") | |||||
| else() | |||||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh | FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh | ||||
| "$1 < $2\n" | "$1 < $2\n" | ||||
| ) | ) | ||||
| set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") | |||||
| endif() | |||||
| foreach(float_type ${FLOAT_TYPES}) | foreach(float_type ${FLOAT_TYPES}) | ||||
| string(SUBSTRING ${float_type} 0 1 float_char_upper) | string(SUBSTRING ${float_type} 0 1 float_char_upper) | ||||
| @@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||||
| c_${float_char}blas1.c) | c_${float_char}blas1.c) | ||||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | ||||
| add_test(NAME "x${float_char}cblat1" | add_test(NAME "x${float_char}cblat1" | ||||
| COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") | |||||
| COMMAND $<TARGET_FILE:x${float_char}cblat1>) | |||||
| #level2 | #level2 | ||||
| add_executable(x${float_char}cblat2 | add_executable(x${float_char}cblat2 | ||||
| @@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||||
| constant.c) | constant.c) | ||||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | ||||
| add_test(NAME "x${float_char}cblat2" | add_test(NAME "x${float_char}cblat2" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||||
| #level3 | #level3 | ||||
| add_executable(x${float_char}cblat3 | add_executable(x${float_char}cblat3 | ||||
| @@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES}) | |||||
| constant.c) | constant.c) | ||||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | ||||
| add_test(NAME "x${float_char}cblat3" | add_test(NAME "x${float_char}cblat3" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||||
| endforeach() | endforeach() | ||||
| @@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG) | |||||
| CEXTRALIB = -lomp | CEXTRALIB = -lomp | ||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(F_COMPILER), NAG) | |||||
| CEXTRALIB = -lgomp | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(BUILD_SINGLE),1) | ifeq ($(BUILD_SINGLE),1) | ||||
| @@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| int i; | int i; | ||||
| if (!blas_server_avail) return 0; | |||||
| LOCK_COMMAND(&server_lock); | LOCK_COMMAND(&server_lock); | ||||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||||
| if (blas_server_avail) { | |||||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||||
| pthread_mutex_lock (&thread_status[i].lock); | |||||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||||
| pthread_cond_signal (&thread_status[i].wakeup); | |||||
| pthread_mutex_lock (&thread_status[i].lock); | |||||
| pthread_mutex_unlock(&thread_status[i].lock); | |||||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||||
| pthread_cond_signal (&thread_status[i].wakeup); | |||||
| } | |||||
| pthread_mutex_unlock(&thread_status[i].lock); | |||||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||||
| pthread_join(blas_threads[i], NULL); | |||||
| } | |||||
| } | |||||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||||
| pthread_mutex_destroy(&thread_status[i].lock); | |||||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||||
| } | |||||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||||
| pthread_join(blas_threads[i], NULL); | |||||
| } | |||||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||||
| pthread_mutex_destroy(&thread_status[i].lock); | |||||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||||
| } | |||||
| #ifdef NEED_STACKATTR | #ifdef NEED_STACKATTR | ||||
| pthread_attr_destory(&attr); | |||||
| pthread_attr_destroy(&attr); | |||||
| #endif | #endif | ||||
| blas_server_avail = 0; | |||||
| blas_server_avail = 0; | |||||
| } | |||||
| UNLOCK_COMMAND(&server_lock); | UNLOCK_COMMAND(&server_lock); | ||||
| return 0; | return 0; | ||||
| @@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){ | |||||
| return NULL; | return NULL; | ||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| if (model == 12) { // Tiger Lake | |||||
| if (support_avx512()) | |||||
| return &gotoblas_SKYLAKEX; | |||||
| if(support_avx2()){ | |||||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||||
| return &gotoblas_HASWELL; | |||||
| } | |||||
| if(support_avx()) { | |||||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| } else { | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; | |||||
| } | |||||
| } | |||||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | if (model == 14 ) { // Kaby Lake, Coffee Lake | ||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
| @@ -656,7 +671,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| case 10: | case 10: | ||||
| if (model == 5 || model == 6) { | |||||
| if (model == 5 || model == 6) { | |||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
| if(support_avx()) { | if(support_avx()) { | ||||
| @@ -666,7 +681,20 @@ static gotoblas_t *get_coretype(void){ | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | |||||
| } | |||||
| if (model == 7) { | |||||
| if (support_avx512()) | |||||
| return &gotoblas_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return &gotoblas_HASWELL; | |||||
| if(support_avx()) { | |||||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| } else { | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| case 0xf: | case 0xf: | ||||
| @@ -43,6 +43,63 @@ | |||||
| #endif | #endif | ||||
| extern gotoblas_t gotoblas_ARMV8; | extern gotoblas_t gotoblas_ARMV8; | ||||
| #ifdef DYNAMIC_LIST | |||||
| #ifdef DYN_CORTEXA53 | |||||
| extern gotoblas_t gotoblas_CORTEXA53; | |||||
| #else | |||||
| #define gotoblas_CORTEXA53 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_CORTEXA57 | |||||
| extern gotoblas_t gotoblas_CORTEXA57; | |||||
| #else | |||||
| #define gotoblas_CORTEXA57 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_CORTEXA72 | |||||
| extern gotoblas_t gotoblas_CORTEXA72; | |||||
| #else | |||||
| #define gotoblas_CORTEXA72 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_CORTEXA73 | |||||
| extern gotoblas_t gotoblas_CORTEXA73; | |||||
| #else | |||||
| #define gotoblas_CORTEXA73 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_FALKOR | |||||
| extern gotoblas_t gotoblas_FALKOR; | |||||
| #else | |||||
| #define gotoblas_FALKOR gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_TSV110 | |||||
| extern gotoblas_t gotoblas_TSV110; | |||||
| #else | |||||
| #define gotoblas_TSV110 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_THUNDERX | |||||
| extern gotoblas_t gotoblas_THUNDERX; | |||||
| #else | |||||
| #define gotoblas_THUNDERX gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_THUNDERX2T99 | |||||
| extern gotoblas_t gotoblas_THUNDERX2T99; | |||||
| #else | |||||
| #define gotoblas_THUNDERX2T99 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_THUNDERX3T110 | |||||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||||
| #else | |||||
| #define gotoblas_THUNDERX3T110 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_EMAG8180 | |||||
| extern gotoblas_t gotoblas_EMAG8180; | |||||
| #else | |||||
| #define gotoblas_EMAG8180 gotoblas_ARMV8 | |||||
| #endif | |||||
| #ifdef DYN_NEOVERSEN1 | |||||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||||
| #else | |||||
| #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 | |||||
| #endif | |||||
| #else | |||||
| extern gotoblas_t gotoblas_CORTEXA53; | extern gotoblas_t gotoblas_CORTEXA53; | ||||
| extern gotoblas_t gotoblas_CORTEXA57; | extern gotoblas_t gotoblas_CORTEXA57; | ||||
| extern gotoblas_t gotoblas_CORTEXA72; | extern gotoblas_t gotoblas_CORTEXA72; | ||||
| @@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110; | |||||
| extern gotoblas_t gotoblas_EMAG8180; | extern gotoblas_t gotoblas_EMAG8180; | ||||
| extern gotoblas_t gotoblas_NEOVERSEN1; | extern gotoblas_t gotoblas_NEOVERSEN1; | ||||
| extern gotoblas_t gotoblas_THUNDERX3T110; | extern gotoblas_t gotoblas_THUNDERX3T110; | ||||
| #endif | |||||
| extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
| @@ -68,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
| #endif | #endif | ||||
| #define get_cpu_ftr(id, var) ({ \ | #define get_cpu_ftr(id, var) ({ \ | ||||
| __asm__("mrs %0, "#id : "=r" (var)); \ | |||||
| __asm__ ("mrs %0, "#id : "=r" (var)); \ | |||||
| }) | }) | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -27,7 +27,9 @@ static char *corename[] = { | |||||
| #define NUM_CORETYPES 4 | #define NUM_CORETYPES 4 | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| #ifndef C_PGI | |||||
| if (gotoblas == &gotoblas_POWER6) return corename[1]; | if (gotoblas == &gotoblas_POWER6) return corename[1]; | ||||
| #endif | |||||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | if (gotoblas == &gotoblas_POWER8) return corename[2]; | ||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | #if (!defined __GNUC__) || ( __GNUC__ >= 6) | ||||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | if (gotoblas == &gotoblas_POWER9) return corename[3]; | ||||
| @@ -38,10 +40,164 @@ char *gotoblas_corename(void) { | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| #if defined(__clang__) | |||||
| static int __builtin_cpu_supports(char* arg) | |||||
| { | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #if defined(C_PGI) || defined(__clang__) | |||||
| /* | |||||
| * NV HPC compilers do not yet implement __builtin_cpu_is(). | |||||
| * Fake a version here for use in the CPU detection code below. | |||||
| * | |||||
| * Strategy here is to first check the CPU to see what it actually is, | |||||
| * and then test the input to see if what the CPU actually is matches | |||||
| * what was requested. | |||||
| */ | |||||
| #include <string.h> | |||||
| /* | |||||
| * Define POWER processor version table. | |||||
| * | |||||
| * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time | |||||
| */ | |||||
| #define CPU_UNKNOWN 0 | |||||
| #define CPU_POWER5 5 | |||||
| #define CPU_POWER6 6 | |||||
| #define CPU_POWER8 8 | |||||
| #define CPU_POWER9 9 | |||||
| #define CPU_POWER10 10 | |||||
| static struct { | |||||
| uint32_t pvr_mask; | |||||
| uint32_t pvr_value; | |||||
| const char* cpu_name; | |||||
| uint32_t cpu_type; | |||||
| } pvrPOWER [] = { | |||||
| { /* POWER6 in P5+ mode; 2.04-compliant processor */ | |||||
| .pvr_mask = 0xffffffff, | |||||
| .pvr_value = 0x0f000001, | |||||
| .cpu_name = "POWER5+", | |||||
| .cpu_type = CPU_POWER5, | |||||
| }, | |||||
| { /* Power6 aka POWER6X*/ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x003e0000, | |||||
| .cpu_name = "POWER6 (raw)", | |||||
| .cpu_type = CPU_POWER6, | |||||
| }, | |||||
| { /* Power7 */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x003f0000, | |||||
| .cpu_name = "POWER7 (raw)", | |||||
| .cpu_type = CPU_POWER6, | |||||
| }, | |||||
| { /* Power7+ */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x004A0000, | |||||
| .cpu_name = "POWER7+ (raw)", | |||||
| .cpu_type = CPU_POWER6, | |||||
| }, | |||||
| { /* Power8E */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x004b0000, | |||||
| .cpu_name = "POWER8E (raw)", | |||||
| .cpu_type = CPU_POWER8, | |||||
| }, | |||||
| { /* Power8NVL */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x004c0000, | |||||
| .cpu_name = "POWER8NVL (raw)", | |||||
| .cpu_type = CPU_POWER8, | |||||
| }, | |||||
| { /* Power8 */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x004d0000, | |||||
| .cpu_name = "POWER8 (raw)", | |||||
| .cpu_type = CPU_POWER8, | |||||
| }, | |||||
| { /* Power9 DD2.0 */ | |||||
| .pvr_mask = 0xffffefff, | |||||
| .pvr_value = 0x004e0200, | |||||
| .cpu_name = "POWER9 (raw)", | |||||
| .cpu_type = CPU_POWER9, | |||||
| }, | |||||
| { /* Power9 DD 2.1 */ | |||||
| .pvr_mask = 0xffffefff, | |||||
| .pvr_value = 0x004e0201, | |||||
| .cpu_name = "POWER9 (raw)", | |||||
| .cpu_type = CPU_POWER9, | |||||
| }, | |||||
| { /* Power9 DD2.2 or later */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x004e0000, | |||||
| .cpu_name = "POWER9 (raw)", | |||||
| .cpu_type = CPU_POWER9, | |||||
| }, | |||||
| { /* Power10 */ | |||||
| .pvr_mask = 0xffff0000, | |||||
| .pvr_value = 0x00800000, | |||||
| .cpu_name = "POWER10 (raw)", | |||||
| .cpu_type = CPU_POWER10, | |||||
| }, | |||||
| { /* End of table, pvr_mask and pvr_value must be zero */ | |||||
| .pvr_mask = 0x0, | |||||
| .pvr_value = 0x0, | |||||
| .cpu_name = "Unknown", | |||||
| .cpu_type = CPU_UNKNOWN, | |||||
| }, | |||||
| }; | |||||
| static int __builtin_cpu_is(const char *cpu) { | |||||
| int i; | |||||
| uint32_t pvr; | |||||
| uint32_t cpu_type; | |||||
| asm("mfpvr %0" : "=r"(pvr)); | |||||
| for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { | |||||
| if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { | |||||
| break; | |||||
| } | |||||
| } | |||||
| #if defined(DEBUG) | |||||
| printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, | |||||
| pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); | |||||
| #endif | |||||
| cpu_type = pvrPOWER[i].cpu_type; | |||||
| if (!strcmp(cpu, "power8")) | |||||
| return cpu_type == CPU_POWER8; | |||||
| if (!strcmp(cpu, "power9")) | |||||
| return cpu_type == CPU_POWER9; | |||||
| return 0; | |||||
| } | |||||
| #endif /* C_PGI */ | |||||
| static gotoblas_t *get_coretype(void) { | static gotoblas_t *get_coretype(void) { | ||||
| #ifndef C_PGI | |||||
| if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | ||||
| return &gotoblas_POWER6; | return &gotoblas_POWER6; | ||||
| #endif | |||||
| if (__builtin_cpu_is("power8")) | if (__builtin_cpu_is("power8")) | ||||
| return &gotoblas_POWER8; | return &gotoblas_POWER8; | ||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | #if (!defined __GNUC__) || ( __GNUC__ >= 6) | ||||
| @@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) { | |||||
| return &gotoblas_POWER10; | return &gotoblas_POWER10; | ||||
| #endif | #endif | ||||
| /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ | /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ | ||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||||
| if (__builtin_cpu_is("power10")) | if (__builtin_cpu_is("power10")) | ||||
| return &gotoblas_POWER9; | return &gotoblas_POWER9; | ||||
| #endif | #endif | ||||
| @@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||||
| switch (found) | switch (found) | ||||
| { | { | ||||
| #ifndef C_PGI | |||||
| case 1: return (&gotoblas_POWER6); | case 1: return (&gotoblas_POWER6); | ||||
| #endif | |||||
| case 2: return (&gotoblas_POWER8); | case 2: return (&gotoblas_POWER8); | ||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | #if (!defined __GNUC__) || ( __GNUC__ >= 6) | ||||
| case 3: return (&gotoblas_POWER9); | case 3: return (&gotoblas_POWER9); | ||||
| @@ -222,11 +222,11 @@ int get_num_procs(void); | |||||
| #else | #else | ||||
| int get_num_procs(void) { | int get_num_procs(void) { | ||||
| static int nums = 0; | static int nums = 0; | ||||
| #if defined(__GLIBC_PREREQ) | |||||
| cpu_set_t cpuset,*cpusetp; | cpu_set_t cpuset,*cpusetp; | ||||
| size_t size; | size_t size; | ||||
| int ret; | int ret; | ||||
| #if defined(__GLIBC_PREREQ) | |||||
| #if !__GLIBC_PREREQ(2, 7) | #if !__GLIBC_PREREQ(2, 7) | ||||
| int i; | int i; | ||||
| #if !__GLIBC_PREREQ(2, 6) | #if !__GLIBC_PREREQ(2, 6) | ||||
| @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); | |||||
| func = &memoryalloc[0]; | func = &memoryalloc[0]; | ||||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||||
| map_address = (*func)((void *)base_address); | map_address = (*func)((void *)base_address); | ||||
| @@ -1619,10 +1619,12 @@ static int on_process_term(void) | |||||
| #else | #else | ||||
| #pragma data_seg(".CRT$XLB") | #pragma data_seg(".CRT$XLB") | ||||
| #endif | #endif | ||||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||||
| #ifdef _WIN64 | #ifdef _WIN64 | ||||
| static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||||
| #pragma const_seg() | #pragma const_seg() | ||||
| #else | #else | ||||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||||
| #pragma data_seg() | #pragma data_seg() | ||||
| #endif | #endif | ||||
| @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI | |||||
| #else | #else | ||||
| #pragma data_seg(".CRT$XTU") | #pragma data_seg(".CRT$XTU") | ||||
| #endif | #endif | ||||
| static int(*p_process_term)(void) = on_process_term; | |||||
| #ifdef _WIN64 | #ifdef _WIN64 | ||||
| static const int(*p_process_term)(void) = on_process_term; | |||||
| #pragma const_seg() | #pragma const_seg() | ||||
| #else | #else | ||||
| static int(*p_process_term)(void) = on_process_term; | |||||
| #pragma data_seg() | #pragma data_seg() | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| #ifndef MEM_LARGE_PAGES | #ifndef MEM_LARGE_PAGES | ||||
| #define MEM_LARGE_PAGES 0x20000000 | #define MEM_LARGE_PAGES 0x20000000 | ||||
| #endif | #endif | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| #define ALLOC_MMAP | #define ALLOC_MMAP | ||||
| #define ALLOC_MALLOC | #define ALLOC_MALLOC | ||||
| #else | |||||
| #define ALLOC_MALLOC | |||||
| inline int puts(const char *str) { return 0; } | |||||
| inline int printf(const char *format, ...) { return 0; } | |||||
| inline char *getenv(const char *name) { return ""; } | |||||
| inline int atoi(const char *str) { return 0; } | |||||
| #endif | #endif | ||||
| #include <stdlib.h> | #include <stdlib.h> | ||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #include <fcntl.h> | #include <fcntl.h> | ||||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||||
| #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) | |||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||
| #ifndef NO_SYSV_IPC | #ifndef NO_SYSV_IPC | ||||
| #include <sys/shm.h> | #include <sys/shm.h> | ||||
| @@ -1,4 +1,4 @@ | |||||
| #!/usr/bin/perl | |||||
| #!/usr/bin/env perl | |||||
| # Changelog | # Changelog | ||||
| # 2017/09/03 staticfloat | # 2017/09/03 staticfloat | ||||
| @@ -1,4 +1,4 @@ | |||||
| #!/usr/bin/perl | |||||
| #!/usr/bin/env perl | |||||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | ||||
| @@ -32,9 +32,9 @@ if ($compiler eq "") { | |||||
| "xlf95", "xlf90", "xlf", | "xlf95", "xlf90", "xlf", | ||||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | ||||
| "pathf90", "pathf95", | "pathf90", "pathf95", | ||||
| "pgf95", "pgf90", "pgf77", | |||||
| "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", | |||||
| "flang", "egfortran", | "flang", "egfortran", | ||||
| "ifort"); | |||||
| "ifort", "nagfor"); | |||||
| OUTER: | OUTER: | ||||
| foreach $lists (@lists) { | foreach $lists (@lists) { | ||||
| @@ -64,7 +64,9 @@ if ($compiler eq "") { | |||||
| if (!$?) { | if (!$?) { | ||||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; | $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; | ||||
| if ($data eq "") { | |||||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; | |||||
| } | |||||
| if ($data =~ /zhoge_/) { | if ($data =~ /zhoge_/) { | ||||
| $bu = "_"; | $bu = "_"; | ||||
| } | } | ||||
| @@ -76,6 +78,7 @@ if ($compiler eq "") { | |||||
| } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { | } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { | ||||
| $data =~ s/\(+.*?\)+//g; | |||||
| $data =~ /(\d+)\.(\d+).(\d+)/; | $data =~ /(\d+)\.(\d+).(\d+)/; | ||||
| $major = $1; | $major = $1; | ||||
| $minor = $2; | $minor = $2; | ||||
| @@ -87,7 +90,7 @@ if ($compiler eq "") { | |||||
| if ($compiler =~ /flang/) { | if ($compiler =~ /flang/) { | ||||
| $vendor = FLANG; | $vendor = FLANG; | ||||
| $openmp = "-fopenmp"; | $openmp = "-fopenmp"; | ||||
| } elsif ($compiler =~ /pgf/) { | |||||
| } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||||
| $vendor = PGI; | $vendor = PGI; | ||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } else { | } else { | ||||
| @@ -123,7 +126,7 @@ if ($compiler eq "") { | |||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| if ($data =~ /PGF/) { | |||||
| if ($data =~ /PGF/ || $data =~ /NVF/) { | |||||
| $vendor = PGI; | $vendor = PGI; | ||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| @@ -133,8 +136,16 @@ if ($compiler eq "") { | |||||
| $openmp = "-openmp"; | $openmp = "-openmp"; | ||||
| } | } | ||||
| if ($data =~ /NAG/) { | |||||
| $vendor = NAG; | |||||
| $openmp = "-openmp"; | |||||
| } | |||||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | ||||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | ||||
| if ($data eq "") { | |||||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; | |||||
| } | |||||
| if ($data =~ / zho_ge__/) { | if ($data =~ / zho_ge__/) { | ||||
| $need2bu = 1; | $need2bu = 1; | ||||
| } | } | ||||
| @@ -177,7 +188,7 @@ if ($compiler eq "") { | |||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| if ($compiler =~ /pgf/) { | |||||
| if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||||
| $vendor = PGI; | $vendor = PGI; | ||||
| $bu = "_"; | $bu = "_"; | ||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| @@ -222,6 +233,12 @@ if ($compiler eq "") { | |||||
| $openmp = "-fopenmp"; | $openmp = "-fopenmp"; | ||||
| } | } | ||||
| if ($compiler =~ /nagfor/) { | |||||
| $vendor = NAG; | |||||
| $bu = "_"; | |||||
| $openmp = "-openmp"; | |||||
| } | |||||
| if ($vendor eq "") { | if ($vendor eq "") { | ||||
| $nofortran = 1; | $nofortran = 1; | ||||
| $compiler = "gfortran"; | $compiler = "gfortran"; | ||||
| @@ -275,14 +292,20 @@ if (!$?) { | |||||
| if ($?) { | if ($?) { | ||||
| $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | ||||
| } | } | ||||
| #For nagfor | |||||
| if ($?) { | |||||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } | |||||
| $binary = "" if ($?); | $binary = "" if ($?); | ||||
| } | } | ||||
| if ($binary eq "") { | if ($binary eq "") { | ||||
| $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; | $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; | ||||
| } | } | ||||
| } | } | ||||
| if ( $vendor eq "NAG") { | |||||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||||
| } | |||||
| $linker_L = ""; | $linker_L = ""; | ||||
| $linker_l = ""; | $linker_l = ""; | ||||
| $linker_a = ""; | $linker_a = ""; | ||||
| @@ -330,12 +353,13 @@ if ($link ne "") { | |||||
| $flags =~ s/\@/\,/g; | $flags =~ s/\@/\,/g; | ||||
| $linker_L .= "-Wl,". $flags . " " ; | $linker_L .= "-Wl,". $flags . " " ; | ||||
| } | } | ||||
| if ($flags =~ /-lgomp/ && $CC =~ /clang/) { | |||||
| if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { | |||||
| $flags = "-lomp"; | $flags = "-lomp"; | ||||
| } | } | ||||
| if ( | if ( | ||||
| ($flags =~ /^\-l/) | ($flags =~ /^\-l/) | ||||
| && ($flags !~ /ibrary/) | |||||
| && ($flags !~ /gfortranbegin/) | && ($flags !~ /gfortranbegin/) | ||||
| && ($flags !~ /frtbegin/) | && ($flags !~ /frtbegin/) | ||||
| && ($flags !~ /pathfstart/) | && ($flags !~ /pathfstart/) | ||||
| @@ -352,6 +376,16 @@ if ($link ne "") { | |||||
| $linker_l .= $flags . " "; | $linker_l .= $flags . " "; | ||||
| } | } | ||||
| # NAG only: append the quickfit.o / safefit.o / thsafe.o objects reported by the compiler to $linker_l. | |||||
| # Vendor names are strings, so compare with `eq`; a numeric `==` evaluates both sides as 0 and matches every vendor. | |||||
| if ( $vendor eq "NAG" && $flags =~ /(?:quickfit|safefit|thsafe)\.o/ ) { | |||||
| $linker_l .= $flags . " "; | |||||
| } | |||||
| $linker_a .= $flags . " " if $flags =~ /\.a$/; | $linker_a .= $flags . " " if $flags =~ /\.a$/; | ||||
| } | } | ||||
| @@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef __riscv | #ifdef __riscv | ||||
| #include "cpuid_riscv64.c" | #include "cpuid_riscv64.c" | ||||
| #define OPENBLAS_SUPPORTED | |||||
| #endif | #endif | ||||
| #ifdef __arm__ | #ifdef __arm__ | ||||
| @@ -4,7 +4,7 @@ | |||||
| #else | #else | ||||
| #include "config_kernel.h" | #include "config_kernel.h" | ||||
| #endif | #endif | ||||
| #include "param.h" | |||||
| #include "common.h" | |||||
| int main(int argc, char **argv) { | int main(int argc, char **argv) { | ||||
| @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ | |||||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
| cblas_caxpby.$(SUFFIX) \ | cblas_caxpby.$(SUFFIX) \ | ||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||||
| CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | ||||
| @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ | |||||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
| cblas_zaxpby.$(SUFFIX) \ | cblas_zaxpby.$(SUFFIX) \ | ||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||||
| CZBLAS2OBJS = \ | CZBLAS2OBJS = \ | ||||
| @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c | |||||
| cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c | |||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||||
| cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c | |||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||||
| cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c | |||||
| cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | ||||
| cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| ifeq ($(BUILD_BFLOAT16),1) | ifeq ($(BUILD_BFLOAT16),1) | ||||
| cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | ||||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -1,4 +1,4 @@ | |||||
| #!/usr/bin/perl | |||||
| #!/usr/bin/env perl | |||||
| $count = 0; | $count = 0; | ||||
| @@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #ifdef SMP | #ifdef SMP | ||||
| double MNK; | double MNK; | ||||
| #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) | |||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | int mode = BLAS_XDOUBLE | BLAS_REAL; | ||||
| @@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) | #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) | ||||
| int nodes; | int nodes; | ||||
| @@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) | |||||
| mode |= (transa << BLAS_TRANSA_SHIFT); | mode |= (transa << BLAS_TRANSA_SHIFT); | ||||
| mode |= (transb << BLAS_TRANSB_SHIFT); | mode |= (transb << BLAS_TRANSB_SHIFT); | ||||
| #endif | |||||
| MNK = (double) args.m * (double) args.n * (double) args.k; | MNK = (double) args.m * (double) args.n * (double) args.k; | ||||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | ||||
| @@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||||
| dq1 = dp1 * *dx1; | dq1 = dp1 * *dx1; | ||||
| if(ABS(dq1) > ABS(dq2)) | if(ABS(dq1) > ABS(dq2)) | ||||
| { | { | ||||
| dflag = ZERO; | |||||
| dh11 = ONE; | dh11 = ONE; | ||||
| dh22 = ONE; | dh22 = ONE; | ||||
| dh21 = - dy1 / *dx1; | dh21 = - dy1 / *dx1; | ||||
| @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| endif () | endif () | ||||
| # Makefile.L3 | # Makefile.L3 | ||||
| set(USE_TRMM false) | set(USE_TRMM false) | ||||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) | |||||
| # Compare against an upper-cased copy so that lower-case core names still match; quoted so an empty TARGET_CORE does not break the call. | |||||
| string(TOUPPER "${TARGET_CORE}" UC_TARGET_CORE) | |||||
| # Cores that use the TRMM kernels (the core is spelled LOONGSON3B; the former LONGSOON3B spelling could never match). | |||||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LOONGSON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) | |||||
| set(USE_TRMM true) | set(USE_TRMM true) | ||||
| endif () | endif () | ||||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||||
| set(USE_TRMM true) | set(USE_TRMM true) | ||||
| endif () | endif () | ||||
| @@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||||
| ifeq ($(GCCVERSIONGTEQ10), 1) | ifeq ($(GCCVERSIONGTEQ10), 1) | ||||
| override CFLAGS += -march=cooperlake | override CFLAGS += -march=cooperlake | ||||
| else | else | ||||
| override CFLAGS += -march=skylake-avx512 | |||||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| override CFLAGS += -fno-asynchronous-unwind-tables | override CFLAGS += -fno-asynchronous-unwind-tables | ||||
| @@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||||
| endif | endif | ||||
| endif | endif | ||||
| else ifeq ($(TARGET_CORE), SKYLAKEX) | else ifeq ($(TARGET_CORE), SKYLAKEX) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 | |||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f | |||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| override CFLAGS += -fno-asynchronous-unwind-tables | override CFLAGS += -fno-asynchronous-unwind-tables | ||||
| endif | endif | ||||
| @@ -1,3 +1,11 @@ | |||||
| FMAFLAG= | |||||
| ifndef OLDGCC | |||||
| ifdef HAVE_FMA3 | |||||
| FMAFLAG = -mfma | |||||
| endif | |||||
| endif | |||||
| ### AMAX ### | ### AMAX ### | ||||
| ifndef SAMAXKERNEL | ifndef SAMAXKERNEL | ||||
| @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
| $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ | $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ | ||||
| $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) | $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) | $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*************************************************************************** | /*************************************************************************** | ||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||
| @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| /***************************************************** | |||||
| * 2014/06/09 Saar | |||||
| * | |||||
| * Order rowMajor | |||||
| * Trans | |||||
| * | |||||
| ******************************************************/ | |||||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | ||||
| { | { | ||||
| BLASLONG i,j; | |||||
| FLOAT *aptr,*bptr; | |||||
| BLASLONG i, j; | |||||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; | |||||
| if ( rows <= 0 ) return(0); | |||||
| if ( cols <= 0 ) return(0); | |||||
| if (rows <= 0) return 0; | |||||
| if (cols <= 0) return 0; | |||||
| aptr = a; | |||||
| a_offset = a; | |||||
| b_offset = b; | |||||
| for ( i=0; i<rows ; i++ ) | |||||
| { | |||||
| bptr = &b[i]; | |||||
| for(j=0; j<cols; j++) | |||||
| { | |||||
| bptr[j*ldb] = alpha * aptr[j]; | |||||
| } | |||||
| aptr += lda; | |||||
| } | |||||
| i = (rows >> 2); | |||||
| if (i > 0) { | |||||
| do { | |||||
| a_offset1 = a_offset; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset += 4 * lda; | |||||
| return(0); | |||||
| b_offset1 = b_offset; | |||||
| b_offset2 = b_offset1 + ldb; | |||||
| b_offset3 = b_offset2 + ldb; | |||||
| b_offset4 = b_offset3 + ldb; | |||||
| b_offset += 4; | |||||
| j = (cols >> 2); | |||||
| if (j > 0) { | |||||
| do { | |||||
| /* Column 1 of MAT_B */ | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||||
| /* Column 2 of MAT_B */ | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A | |||||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||||
| /* Column 3 of MAT_B */ | |||||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A | |||||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||||
| *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; | |||||
| *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; | |||||
| /* Column 4 of MAT_B */ | |||||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A | |||||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||||
| *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; | |||||
| *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; | |||||
| a_offset1 += 4; | |||||
| a_offset2 += 4; | |||||
| a_offset3 += 4; | |||||
| a_offset4 += 4; | |||||
| b_offset1 += ldb * 4; | |||||
| b_offset2 += ldb * 4; | |||||
| b_offset3 += ldb * 4; | |||||
| b_offset4 += ldb * 4; | |||||
| j--; | |||||
| } while (j > 0); | |||||
| } // if(j > 0) | |||||
| if (cols & 2) { | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||||
| a_offset1 += 2; | |||||
| a_offset2 += 2; | |||||
| a_offset3 += 2; | |||||
| a_offset4 += 2; | |||||
| b_offset1 += ldb*2; | |||||
| } | |||||
| if (cols & 1) { | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||||
| } | |||||
| i--; | |||||
| } while (i > 0); | |||||
| } | |||||
| } | |||||
| if (rows & 2) { | |||||
| a_offset1 = a_offset; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset += 2 * lda; | |||||
| b_offset1 = b_offset; | |||||
| b_offset2 = b_offset1 + ldb; | |||||
| b_offset3 = b_offset2 + ldb; | |||||
| b_offset4 = b_offset3 + ldb; | |||||
| b_offset += 2; | |||||
| j = (cols >> 2); | |||||
| if (j > 0){ | |||||
| do { | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||||
| a_offset1 += 4; | |||||
| a_offset2 += 4; | |||||
| b_offset1 += ldb * 4; | |||||
| b_offset2 += ldb * 4; | |||||
| b_offset3 += ldb * 4; | |||||
| b_offset4 += ldb * 4; | |||||
| j--; | |||||
| } while (j > 0); | |||||
| } | |||||
| if (cols & 2){ | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||||
| a_offset1 += 2; | |||||
| a_offset2 += 2; | |||||
| b_offset1 += ldb*2; | |||||
| } | |||||
| if (cols & 1){ | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||||
| } | |||||
| } // if (rows & 2) | |||||
| if (rows & 1) { | |||||
| a_offset1 = a_offset; | |||||
| a_offset += lda; | |||||
| b_offset1 = b_offset; | |||||
| b_offset2 = b_offset1 + ldb; | |||||
| b_offset3 = b_offset2 + ldb; | |||||
| b_offset4 = b_offset3 + ldb; | |||||
| j = (cols >> 2); | |||||
| if (j > 0){ | |||||
| do { | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||||
| a_offset1 += 4; | |||||
| b_offset1 += ldb * 4; | |||||
| b_offset2 += ldb * 4; | |||||
| b_offset3 += ldb * 4; | |||||
| b_offset4 += ldb * 4; | |||||
| j--; | |||||
| } while (j > 0); | |||||
| } | |||||
| if (cols & 2){ | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||||
| a_offset1 += 2; | |||||
| b_offset1 += ldb * 2; | |||||
| } | |||||
| if (cols & 1){ | |||||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| dot[0]=0.0; | dot[0]=0.0; | ||||
| dot[1]=0.0; | dot[1]=0.0; | ||||
| #if !defined(__PPC__) && !defined(__SunOS) | |||||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||||
| CREAL(result) = 0.0 ; | CREAL(result) = 0.0 ; | ||||
| CIMAG(result) = 0.0 ; | CIMAG(result) = 0.0 ; | ||||
| #else | #else | ||||
| @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| i++ ; | i++ ; | ||||
| } | } | ||||
| #if !defined(__PPC__) && !defined(__SunOS) | |||||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||||
| CREAL(result) = dot[0]; | CREAL(result) = dot[0]; | ||||
| CIMAG(result) = dot[1]; | CIMAG(result) = dot[1]; | ||||
| #else | #else | ||||
| @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S | |||||
| ZNRM2KERNEL = znrm2.S | ZNRM2KERNEL = znrm2.S | ||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | SDOTKERNEL = ../generic/dot.c | ||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| DGEMM_BETA = dgemm_beta.S | DGEMM_BETA = dgemm_beta.S | ||||
| @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S | |||||
| CNRM2KERNEL = znrm2.S | CNRM2KERNEL = znrm2.S | ||||
| ZNRM2KERNEL = znrm2.S | ZNRM2KERNEL = znrm2.S | ||||
| DDOTKERNEL = dot.S | |||||
| SDOTKERNEL = ../generic/dot.c | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| DSDOTKERNEL = dot.S | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | |||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| DDOTKERNEL = dot.S | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | |||||
| DGEMM_BETA = dgemm_beta.S | DGEMM_BETA = dgemm_beta.S | ||||
| SGEMM_BETA = sgemm_beta.S | SGEMM_BETA = sgemm_beta.S | ||||
| @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S | |||||
| CCOPYKERNEL = copy.S | CCOPYKERNEL = copy.S | ||||
| ZCOPYKERNEL = copy.S | ZCOPYKERNEL = copy.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | SDOTKERNEL = ../generic/dot.c | ||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S | |||||
| SDOTKERNEL = dot_thunderx.c | SDOTKERNEL = dot_thunderx.c | ||||
| DDOTKERNEL = ddot_thunderx.c | DDOTKERNEL = ddot_thunderx.c | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S | |||||
| SDOTKERNEL = dot.S | SDOTKERNEL = dot.S | ||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n | |||||
| #define CUR_MAXINV "d8" | #define CUR_MAXINV "d8" | ||||
| #define CUR_MAXINV_V "v8.2d" | #define CUR_MAXINV_V "v8.2d" | ||||
| #define CUR_MAX_V "v8.2d" | #define CUR_MAX_V "v8.2d" | ||||
| #define REGINF "d9" | |||||
| static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | ||||
| double *ssq, double *scale) | double *ssq, double *scale) | ||||
| @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " ble 9f //nrm2_kernel_L999 \n" | " ble 9f //nrm2_kernel_L999 \n" | ||||
| "1: //nrm2_kernel_F_BEGIN: \n" | "1: //nrm2_kernel_F_BEGIN: \n" | ||||
| " mov x6, #0x7FF0000000000000 //+Infinity \n" | |||||
| " fmov "REGZERO", xzr \n" | " fmov "REGZERO", xzr \n" | ||||
| " fmov "REGONE", #1.0 \n" | " fmov "REGONE", #1.0 \n" | ||||
| " fmov "REGINF", x6 \n" | |||||
| " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | ||||
| " mov "J", "N" \n" | " mov "J", "N" \n" | ||||
| " cmp "J", xzr \n" | " cmp "J", xzr \n" | ||||
| @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " ldr d4, ["X"] \n" | " ldr d4, ["X"] \n" | ||||
| " fabs d4, d4 \n" | " fabs d4, d4 \n" | ||||
| " fmax "CUR_MAX", "SCALE", d4 \n" | " fmax "CUR_MAX", "SCALE", d4 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| " fmul "SCALE", "SCALE", "SCALE" \n" | " fmul "SCALE", "SCALE", "SCALE" \n" | ||||
| " fmul "SSQ", "SSQ", "SCALE" \n" | " fmul "SSQ", "SSQ", "SCALE" \n" | ||||
| @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " ldr d3, ["X", #8] \n" | " ldr d3, ["X", #8] \n" | ||||
| " fabs d3, d3 \n" | " fabs d3, d3 \n" | ||||
| " fmax "CUR_MAX", "SCALE", d3 \n" | " fmax "CUR_MAX", "SCALE", d3 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| " fmul "SCALE", "SCALE", "SCALE" \n" | " fmul "SCALE", "SCALE", "SCALE" \n" | ||||
| " fmul "SSQ", "SSQ", "SCALE" \n" | " fmul "SSQ", "SSQ", "SCALE" \n" | ||||
| @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " fmaxp v24.2d, v24.2d, v26.2d \n" | " fmaxp v24.2d, v24.2d, v26.2d \n" | ||||
| " fmaxp v24.2d, v24.2d, v24.2d \n" | " fmaxp v24.2d, v24.2d, v24.2d \n" | ||||
| " fmax "CUR_MAX", "SCALE", d24 \n" | " fmax "CUR_MAX", "SCALE", d24 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | ||||
| " //dup "CUR_MAX_V", v7.d[0] \n" | " //dup "CUR_MAX_V", v7.d[0] \n" | ||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " fmaxp v24.2d, v24.2d, v26.2d \n" | " fmaxp v24.2d, v24.2d, v26.2d \n" | ||||
| " fmaxp v24.2d, v24.2d, v24.2d \n" | " fmaxp v24.2d, v24.2d, v24.2d \n" | ||||
| " fmax "CUR_MAX", "SCALE", d24 \n" | " fmax "CUR_MAX", "SCALE", d24 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | ||||
| " //dup "CUR_MAX_V", v7.d[0] \n" | " //dup "CUR_MAX_V", v7.d[0] \n" | ||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " ldr d4, ["X"] \n" | " ldr d4, ["X"] \n" | ||||
| " fabs d4, d4 \n" | " fabs d4, d4 \n" | ||||
| " fmax "CUR_MAX", "SCALE", d4 \n" | " fmax "CUR_MAX", "SCALE", d4 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| " fmul "SCALE", "SCALE", "SCALE" \n" | " fmul "SCALE", "SCALE", "SCALE" \n" | ||||
| " fmul "SSQ", "SSQ", "SCALE" \n" | " fmul "SSQ", "SSQ", "SCALE" \n" | ||||
| @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| " ldr d3, ["X", #8] \n" | " ldr d3, ["X", #8] \n" | ||||
| " fabs d3, d3 \n" | " fabs d3, d3 \n" | ||||
| " fmax "CUR_MAX", "SCALE", d3 \n" | " fmax "CUR_MAX", "SCALE", d3 \n" | ||||
| " fcmp "CUR_MAX", "REGINF" \n" | |||||
| " beq 10f \n" | |||||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | ||||
| " fmul "SCALE", "SCALE", "SCALE" \n" | " fmul "SCALE", "SCALE", "SCALE" \n" | ||||
| " fmul "SSQ", "SSQ", "SCALE" \n" | " fmul "SSQ", "SSQ", "SCALE" \n" | ||||
| @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| "9: //nrm2_kernel_L999: \n" | "9: //nrm2_kernel_L999: \n" | ||||
| " str "SSQ", [%[SSQ_]] \n" | " str "SSQ", [%[SSQ_]] \n" | ||||
| " str "SCALE", [%[SCALE_]] \n" | " str "SCALE", [%[SCALE_]] \n" | ||||
| " b 11f \n" | |||||
| "10: \n" | |||||
| " str "REGINF", [%[SSQ_]] \n" | |||||
| " str "REGINF", [%[SCALE_]] \n" | |||||
| "11: \n" | |||||
| : | : | ||||
| : [SSQ_] "r" (ssq), //%0 | : [SSQ_] "r" (ssq), //%0 | ||||
| @@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||||
| [INCX_] "r" (inc_x) //%4 | [INCX_] "r" (inc_x) //%4 | ||||
| : "cc", | : "cc", | ||||
| "memory", | "memory", | ||||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||||
| "x0", "x1", "x2", "x3", "x4", "x5", "x6", | |||||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" | "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" | ||||
| ); | ); | ||||
| @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| cur_ssq = *ptr; | cur_ssq = *ptr; | ||||
| cur_scale = *(ptr + 1); | cur_scale = *(ptr + 1); | ||||
| if (cur_ssq == INFINITY) { | |||||
| ssq = INFINITY; | |||||
| scale = INFINITY; | |||||
| break; | |||||
| } | |||||
| if (cur_scale != 0) { | if (cur_scale != 0) { | ||||
| if (cur_scale > scale) { | if (cur_scale > scale) { | ||||
| scale = (scale / cur_scale); | scale = (scale / cur_scale); | ||||
| @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c | |||||
| SDOTKERNEL = sdot_power10.c | SDOTKERNEL = sdot_power10.c | ||||
| DDOTKERNEL = ddot_power10.c | DDOTKERNEL = ddot_power10.c | ||||
| DSDOTKERNEL = sdot_power10.c | DSDOTKERNEL = sdot_power10.c | ||||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||||
| CDOTKERNEL = cdot_power9.S | |||||
| else | |||||
| CDOTKERNEL = cdot.c | CDOTKERNEL = cdot.c | ||||
| endif | |||||
| ZDOTKERNEL = zdot.c | ZDOTKERNEL = zdot.c | ||||
| # | # | ||||
| SNRM2KERNEL = ../arm/nrm2.c | SNRM2KERNEL = ../arm/nrm2.c | ||||
| @@ -0,0 +1,115 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL 1 | |||||
| static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| { /* Copy n elements from x to y using POWER10 paired vector loads (lxvp). | |||||
| * One pass moves 256 bytes and subtracts 32 from n, i.e. 8 bytes per element. | |||||
| * The first 256 bytes are loaded and the last 256 bytes are stored unconditionally, | |||||
| * so n has to be a positive multiple of 32 (the caller visible in this diff passes | |||||
| * n & -32 and only when that value is > 0). | |||||
| * The loop is software pipelined: stores of the current block are interleaved with | |||||
| * loads of the next one, and the code after label two drains the final block. | |||||
| * NOTE(review): the =m and m operands name only *y and *x while the asm touches n | |||||
| * elements and there is no memory clobber - confirm this is sufficient. | |||||
| */ | |||||
| __asm__ | |||||
| ( | |||||
| "lxvp 32, 0(%2) \n\t" /* prime vs32-vs47 with the first 256 bytes of x */ | |||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "lxvp 38, 96(%2) \n\t" | |||||
| "lxvp 40, 128(%2) \n\t" | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | |||||
| "addi %2, %2, 256 \n\t" /* x += 256 bytes */ | |||||
| "addic. %1, %1, -32 \n\t" /* n -= 32; the result is tested by the branch below */ | |||||
| "ble two%= \n\t" /* nothing further to load: go straight to the drain */ | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "stxv 33, 0(%3) \n\t" /* each lxvp pair is written back as two stxv, odd register at the lower address */ | |||||
| "stxv 32, 16(%3) \n\t" | |||||
| "stxv 35, 32(%3) \n\t" | |||||
| "stxv 34, 48(%3) \n\t" | |||||
| "stxv 37, 64(%3) \n\t" | |||||
| "stxv 36, 80(%3) \n\t" | |||||
| "stxv 39, 96(%3) \n\t" | |||||
| "stxv 38, 112(%3) \n\t" | |||||
| "lxvp 32, 0(%2) \n\t" /* refill vs32-vs39 from the next block of x */ | |||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "lxvp 38, 96(%2) \n\t" | |||||
| "stxv 41, 128(%3) \n\t" | |||||
| "stxv 40, 144(%3) \n\t" | |||||
| "stxv 43, 160(%3) \n\t" | |||||
| "stxv 42, 176(%3) \n\t" | |||||
| "stxv 45, 192(%3) \n\t" | |||||
| "stxv 44, 208(%3) \n\t" | |||||
| "stxv 47, 224(%3) \n\t" | |||||
| "stxv 46, 240(%3) \n\t" | |||||
| "lxvp 40, 128(%2) \n\t" /* refill vs40-vs47 from the next block of x */ | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | |||||
| "addi %3, %3, 256 \n\t" /* y += 256 bytes */ | |||||
| "addi %2, %2, 256 \n\t" /* x += 256 bytes */ | |||||
| "addic. %1, %1, -32 \n\t" /* n -= 32 */ | |||||
| "bgt one%= \n" /* keep looping while n is still positive */ | |||||
| "two%=: \n\t" /* drain: store the final 256 bytes already held in vs32-vs47 */ | |||||
| "stxv 33, 0(%3) \n\t" | |||||
| "stxv 32, 16(%3) \n\t" | |||||
| "stxv 35, 32(%3) \n\t" | |||||
| "stxv 34, 48(%3) \n\t" | |||||
| "stxv 37, 64(%3) \n\t" | |||||
| "stxv 36, 80(%3) \n\t" | |||||
| "stxv 39, 96(%3) \n\t" | |||||
| "stxv 38, 112(%3) \n\t" | |||||
| "stxv 41, 128(%3) \n\t" | |||||
| "stxv 40, 144(%3) \n\t" | |||||
| "stxv 43, 160(%3) \n\t" | |||||
| "stxv 42, 176(%3) \n\t" | |||||
| "stxv 45, 192(%3) \n\t" | |||||
| "stxv 44, 208(%3) \n\t" | |||||
| "stxv 47, 224(%3) \n\t" | |||||
| "stxv 46, 240(%3) \n\t" | |||||
| "#n=%1 x=%4=%2 y=%0=%3" /* assembler-comment line that names every operand */ | |||||
| : | |||||
| "=m" (*y), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "+b" (y) // 3 | |||||
| : | |||||
| "m" (*x) | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" | |||||
| ); | |||||
| } | |||||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #include "copy_microk_power10.c" | |||||
| #include "ccopy_microk_power10.c" | |||||
| #endif | #endif | ||||
| #ifndef HAVE_KERNEL | #ifndef HAVE_KERNEL | ||||
| @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | if ( (inc_x == 1) && (inc_y == 1 )) | ||||
| { | { | ||||
| BLASLONG n1 = n & -64; | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| copy_kernel(n1, x, y); | copy_kernel(n1, x, y); | ||||
| @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER10) | |||||
| #include "cdot_microk_power10.c" | |||||
| #else | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | ||||
| @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | if ((inc_x == 1) && (inc_y == 1)) { | ||||
| #if defined(POWER10) | |||||
| BLASLONG n1 = n & -16; | |||||
| #else | |||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| #endif | |||||
| BLASLONG j=0; | BLASLONG j=0; | ||||
| if (n1){ | if (n1){ | ||||
| @@ -0,0 +1,177 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||||
| { /* Accumulate products of the single-precision complex vectors x and y and store | |||||
| * the partial sums at dot. | |||||
| * Even accumulators (vs32, vs34, vs36, vs38) collect x times y lane by lane; odd | |||||
| * accumulators (vs33, vs35, vs37, vs39) collect x times a copy of y whose real and | |||||
| * imaginary words were swapped by xxperm with the mask below (see the per-instruction | |||||
| * comments). | |||||
| * Each pass consumes 128 bytes of x and of y and subtracts 16 from n. The first 128 | |||||
| * bytes of both inputs are loaded unconditionally, so n has to be a positive multiple | |||||
| * of 16 (the POWER10 caller visible in this diff passes n & -16 when it is non-zero). | |||||
| * The closing stxv writes one 16-byte vector to dot. | |||||
| * NOTE(review): the =m operand names only *dot while 16 bytes are stored - confirm | |||||
| * that every caller passes a buffer of at least four floats. | |||||
| * NOTE(review): the name says kernel_8 but the loop step is 16 elements. | |||||
| */ | |||||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; /* xxperm selector used on the y registers */ | |||||
| __asm__ | |||||
| ( | |||||
| "dcbt 0, %2 \n\t" /* prefetch hint for x */ | |||||
| "dcbt 0, %3 \n\t" /* prefetch hint for y */ | |||||
| "xxlxor 32, 32, 32 \n\t" /* zero the eight accumulators vs32-vs39 */ | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| "xxlxor 34, 34, 34 \n\t" | |||||
| "xxlxor 35, 35, 35 \n\t" | |||||
| "xxlxor 36, 36, 36 \n\t" | |||||
| "xxlxor 37, 37, 37 \n\t" | |||||
| "xxlxor 38, 38, 38 \n\t" | |||||
| "xxlxor 39, 39, 39 \n\t" | |||||
| "lxvp 40, 0(%2) \n\t" /* preload 128 bytes of x into vs40-vs47 */ | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "lxvp 48, 0(%3) \n\t" /* preload 128 bytes of y into vs48-vs55 */ | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "xxperm 56, 48, %x7 \n\t" /* vs56-vs63 = permuted copies of the y registers */ | |||||
| "xxperm 57, 49, %x7 \n\t" | |||||
| "xxperm 58, 50, %x7 \n\t" | |||||
| "xxperm 59, 51, %x7 \n\t" | |||||
| "xxperm 60, 52, %x7 \n\t" | |||||
| "xxperm 61, 53, %x7 \n\t" | |||||
| "xxperm 62, 54, %x7 \n\t" | |||||
| "xxperm 63, 55, %x7 \n\t" | |||||
| "addi %2, %2, 128 \n\t" /* x += 128 bytes */ | |||||
| "addi %3, %3, 128 \n\t" /* y += 128 bytes */ | |||||
| "addic. %1, %1, -16 \n\t" /* n -= 16 */ | |||||
| "ble two%= \n\t" /* only one block: skip the pipelined loop */ | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvp 48, 0(%3) \n\t" /* reload of y for the next pass, issued once its old value is consumed */ | |||||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvp 40, 0(%2) \n\t" /* reload of x for the next pass */ | |||||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "xxperm 56, 48, %x7 \n\t" /* rebuild the permuted copies from the freshly loaded y */ | |||||
| "xxperm 57, 49, %x7 \n\t" | |||||
| "xxperm 58, 50, %x7 \n\t" | |||||
| "xxperm 59, 51, %x7 \n\t" | |||||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "xxperm 60, 52, %x7 \n\t" | |||||
| "xxperm 61, 53, %x7 \n\t" | |||||
| "xxperm 62, 54, %x7 \n\t" | |||||
| "xxperm 63, 55, %x7 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "bgt one%= \n" /* keep looping while n is still positive */ | |||||
| "two%=: \n\t" /* tail: consume the last block that is already in registers */ | |||||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvaddsp 32, 32, 34 \n\t" /* reduction: fold the four even and four odd accumulators */ | |||||
| "xvaddsp 36, 36, 38 \n\t" | |||||
| "xvaddsp 33, 33, 35 \n\t" | |||||
| "xvaddsp 37, 37, 39 \n\t" | |||||
| "xvaddsp 35, 32, 36 \n\t" | |||||
| "xvaddsp 34, 33, 37 \n\t" | |||||
| "xxswapd 32, 35 \n\t" /* add the upper and lower doublewords of each sum */ | |||||
| "xxswapd 33, 34 \n\t" | |||||
| "xvaddsp 35, 35, 32 \n\t" | |||||
| "xvaddsp 34, 34, 33 \n\t" | |||||
| "xxpermdi 34, 34, 35, 2 \n\t" /* merge both partial results into vs34 */ | |||||
| "stxv 34, 0(%6) \n\t" /* store the 16-byte result vector at dot */ | |||||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" /* assembler-comment line that names the operands */ | |||||
| : | |||||
| "=m" (*dot), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "+b" (y) // 3 | |||||
| : | |||||
| "m" (*x), | |||||
| "m" (*y), | |||||
| "b" (dot), // 6 | |||||
| "wa" (mask) | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||||
| ); | |||||
| } | |||||
| @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||||
| "one%=: \n\t" | "one%=: \n\t" | ||||
| "stxvp 32, 0(%3) \n\t" | "stxvp 32, 0(%3) \n\t" | ||||
| "lxvp 32, 0(%2) \n\t" | |||||
| "stxvp 34, 32(%3) \n\t" | "stxvp 34, 32(%3) \n\t" | ||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "stxvp 36, 64(%3) \n\t" | "stxvp 36, 64(%3) \n\t" | ||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "stxvp 38, 96(%3) \n\t" | "stxvp 38, 96(%3) \n\t" | ||||
| "lxvp 32, 0(%2) \n\t" | |||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "lxvp 38, 96(%2) \n\t" | "lxvp 38, 96(%2) \n\t" | ||||
| "stxvp 40, 128(%3) \n\t" | "stxvp 40, 128(%3) \n\t" | ||||
| "lxvp 40, 128(%2) \n\t" | |||||
| "stxvp 42, 160(%3) \n\t" | "stxvp 42, 160(%3) \n\t" | ||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "stxvp 44, 192(%3) \n\t" | "stxvp 44, 192(%3) \n\t" | ||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "stxvp 46, 224(%3) \n\t" | "stxvp 46, 224(%3) \n\t" | ||||
| "lxvp 40, 128(%2) \n\t" | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | "lxvp 46, 224(%2) \n\t" | ||||
| "stxvp 48, 256(%3) \n\t" | "stxvp 48, 256(%3) \n\t" | ||||
| "lxvp 48, 256(%2) \n\t" | |||||
| "stxvp 50, 288(%3) \n\t" | "stxvp 50, 288(%3) \n\t" | ||||
| "lxvp 50, 288(%2) \n\t" | |||||
| "stxvp 52, 320(%3) \n\t" | "stxvp 52, 320(%3) \n\t" | ||||
| "lxvp 52, 320(%2) \n\t" | |||||
| "stxvp 54, 352(%3) \n\t" | "stxvp 54, 352(%3) \n\t" | ||||
| "lxvp 48, 256(%2) \n\t" | |||||
| "lxvp 50, 288(%2) \n\t" | |||||
| "lxvp 52, 320(%2) \n\t" | |||||
| "lxvp 54, 352(%2) \n\t" | "lxvp 54, 352(%2) \n\t" | ||||
| "stxvp 56, 384(%3) \n\t" | "stxvp 56, 384(%3) \n\t" | ||||
| "lxvp 56, 384(%2) \n\t" | |||||
| "stxvp 58, 416(%3) \n\t" | "stxvp 58, 416(%3) \n\t" | ||||
| "lxvp 58, 416(%2) \n\t" | |||||
| "stxvp 60, 448(%3) \n\t" | "stxvp 60, 448(%3) \n\t" | ||||
| "lxvp 60, 448(%2) \n\t" | |||||
| "stxvp 62, 480(%3) \n\t" | "stxvp 62, 480(%3) \n\t" | ||||
| "lxvp 56, 384(%2) \n\t" | |||||
| "lxvp 58, 416(%2) \n\t" | |||||
| "lxvp 60, 448(%2) \n\t" | |||||
| "lxvp 62, 480(%2) \n\t" | "lxvp 62, 480(%2) \n\t" | ||||
| "addi %3, %3, 512 \n\t" | "addi %3, %3, 512 \n\t" | ||||
| @@ -0,0 +1,176 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||||
| { /* Scale the single-precision complex vector x in place by alpha_r + i*alpha_i. | |||||
| * Per the instruction comments each element becomes | |||||
| * (x_r*alpha_r - x_i*alpha_i, x_i*alpha_r + x_r*alpha_i): | |||||
| * the data is multiplied by a splat of alpha_r, a copy permuted with the mask is | |||||
| * multiplied by t0, and the two products are added. | |||||
| * Each pass handles 128 bytes and subtracts 16 from n. The first 128 bytes are loaded | |||||
| * and the tail block is stored unconditionally, so n has to be a positive multiple of 16. | |||||
| * NOTE(review): despite the zscal name this variant operates on float data. | |||||
| * NOTE(review): vs33 is listed as clobbered but is not referenced by the asm. | |||||
| */ | |||||
| __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; /* sign pattern applied to the permuted copy */ | |||||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; /* xxperm selector */ | |||||
| __asm__ | |||||
| ( | |||||
| "dcbt 0, %2 \n\t" /* prefetch hint for x */ | |||||
| "xscvdpspn 32, %x3 \n\t" /* convert alpha_r to single precision ... */ | |||||
| "xxspltw 32, 32, 0 \n\t" /* ... and replicate it into every word of vs32 */ | |||||
| "lxvp 40, 0(%2) \n\t" /* preload 128 bytes of x into vs40-vs47 */ | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "addic. %1, %1, -16 \n\t" /* n -= 16 */ | |||||
| "ble two%= \n\t" /* only one block: skip the pipelined loop */ | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmulsp 49, 41, 32 \n\t" | |||||
| "xvmulsp 50, 42, 32 \n\t" | |||||
| "xvmulsp 51, 43, 32 \n\t" | |||||
| "xvmulsp 52, 44, 32 \n\t" | |||||
| "xvmulsp 53, 45, 32 \n\t" | |||||
| "xvmulsp 54, 46, 32 \n\t" | |||||
| "xvmulsp 55, 47, 32 \n\t" | |||||
| "xxperm 34, 40, %x5 \n\t" /* permuted copies of the eight data registers */ | |||||
| "xxperm 35, 41, %x5 \n\t" | |||||
| "xxperm 36, 42, %x5 \n\t" | |||||
| "xxperm 37, 43, %x5 \n\t" | |||||
| "xxperm 38, 44, %x5 \n\t" | |||||
| "xxperm 39, 45, %x5 \n\t" | |||||
| "xxperm 56, 46, %x5 \n\t" | |||||
| "xxperm 57, 47, %x5 \n\t" | |||||
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmulsp 35, 35, %x4 \n\t" | |||||
| "lxvp 40, 128(%2) \n\t" /* fetch the next 128 bytes while the current block is finished */ | |||||
| "xvmulsp 36, 36, %x4 \n\t" | |||||
| "xvmulsp 37, 37, %x4 \n\t" | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "xvmulsp 38, 38, %x4 \n\t" | |||||
| "xvmulsp 39, 39, %x4 \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "xvmulsp 56, 56, %x4 \n\t" | |||||
| "xvmulsp 57, 57, %x4 \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | |||||
| "xvaddsp 48, 48, 34 \n\t" /* combine the alpha_r and alpha_i products */ | |||||
| "xvaddsp 49, 49, 35 \n\t" | |||||
| "xvaddsp 50, 50, 36 \n\t" | |||||
| "xvaddsp 51, 51, 37 \n\t" | |||||
| "stxvp 48, 0(%2) \n\t" /* write the results back over the block just processed */ | |||||
| "xvaddsp 52, 52, 38 \n\t" | |||||
| "xvaddsp 53, 53, 39 \n\t" | |||||
| "stxvp 50, 32(%2) \n\t" | |||||
| "xvaddsp 54, 54, 56 \n\t" | |||||
| "xvaddsp 55, 55, 57 \n\t" | |||||
| "stxvp 52, 64(%2) \n\t" | |||||
| "stxvp 54, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" /* x += 128 bytes */ | |||||
| "addic. %1, %1, -16 \n\t" /* n -= 16 */ | |||||
| "bgt one%= \n" /* keep looping while n is still positive */ | |||||
| "two%=: \n\t" /* tail: same arithmetic for the last block, without further loads */ | |||||
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmulsp 49, 41, 32 \n\t" | |||||
| "xvmulsp 50, 42, 32 \n\t" | |||||
| "xvmulsp 51, 43, 32 \n\t" | |||||
| "xvmulsp 52, 44, 32 \n\t" | |||||
| "xvmulsp 53, 45, 32 \n\t" | |||||
| "xvmulsp 54, 46, 32 \n\t" | |||||
| "xvmulsp 55, 47, 32 \n\t" | |||||
| "xxperm 34, 40, %x5 \n\t" | |||||
| "xxperm 35, 41, %x5 \n\t" | |||||
| "xxperm 36, 42, %x5 \n\t" | |||||
| "xxperm 37, 43, %x5 \n\t" | |||||
| "xxperm 38, 44, %x5 \n\t" | |||||
| "xxperm 39, 45, %x5 \n\t" | |||||
| "xxperm 56, 46, %x5 \n\t" | |||||
| "xxperm 57, 47, %x5 \n\t" | |||||
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmulsp 35, 35, %x4 \n\t" | |||||
| "xvmulsp 36, 36, %x4 \n\t" | |||||
| "xvmulsp 37, 37, %x4 \n\t" | |||||
| "xvmulsp 38, 38, %x4 \n\t" | |||||
| "xvmulsp 39, 39, %x4 \n\t" | |||||
| "xvmulsp 56, 56, %x4 \n\t" | |||||
| "xvmulsp 57, 57, %x4 \n\t" | |||||
| "xvaddsp 48, 48, 34 \n\t" | |||||
| "xvaddsp 49, 49, 35 \n\t" | |||||
| "xvaddsp 50, 50, 36 \n\t" | |||||
| "xvaddsp 51, 51, 37 \n\t" | |||||
| "stxvp 48, 0(%2) \n\t" | |||||
| "xvaddsp 52, 52, 38 \n\t" | |||||
| "xvaddsp 53, 53, 39 \n\t" | |||||
| "stxvp 50, 32(%2) \n\t" | |||||
| "xvaddsp 54, 54, 56 \n\t" | |||||
| "xvaddsp 55, 55, 57 \n\t" | |||||
| "stxvp 52, 64(%2) \n\t" | |||||
| "stxvp 54, 96(%2) \n\t" | |||||
| "#n=%1 x=%0=%2 alpha=(%3,%4)\n" /* assembler-comment line that names the operands */ | |||||
| : | |||||
| "+m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| "f" (alpha_r), // 3 | |||||
| "wa" (t0), // 4 | |||||
| "wa" (mask) // 5 | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57" | |||||
| ); | |||||
| } | |||||
| @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "cswap_microk_power8.c" | #include "cswap_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "cswap_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,127 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #if defined(DOUBLE) | |||||
| #define HAVE_KERNEL_16 1 | |||||
| static void zswap_kernel_16 (long n, double *x, double *y) | |||||
| #else | |||||
| #define HAVE_KERNEL_32 1 | |||||
| static void cswap_kernel_32 (long n, float *x, float *y) | |||||
| #endif | |||||
| { /* Exchange the contents of x and y, 256 bytes from each per pass. | |||||
| * The same body is compiled as zswap_kernel_16 when DOUBLE is defined (n drops by 16 | |||||
| * per pass) and as cswap_kernel_32 otherwise (n drops by 32 per pass). | |||||
| * Both 256-byte blocks are loaded into registers before anything is stored, then the | |||||
| * y data is written to x and the x data to y. | |||||
| * The loop body runs once before n is tested, so n has to be a positive multiple of | |||||
| * the per-pass step. | |||||
| * NOTE(review): presumably n counts complex elements (16 bytes each with DOUBLE, | |||||
| * 8 bytes otherwise), which matches 256 bytes per pass - confirm against the caller. | |||||
| */ | |||||
| __asm__ | |||||
| ( | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "lxvp 32, 0(%4) \n\t" /* vs32-vs47 = 256 bytes of y */ | |||||
| "lxvp 34, 32(%4) \n\t" | |||||
| "lxvp 36, 64(%4) \n\t" | |||||
| "lxvp 38, 96(%4) \n\t" | |||||
| "lxvp 40, 128(%4) \n\t" | |||||
| "lxvp 42, 160(%4) \n\t" | |||||
| "lxvp 44, 192(%4) \n\t" | |||||
| "lxvp 46, 224(%4) \n\t" | |||||
| "lxvp 48, 0(%3) \n\t" /* vs48-vs63 = 256 bytes of x */ | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "lxvp 56, 128(%3) \n\t" | |||||
| "lxvp 58, 160(%3) \n\t" | |||||
| "lxvp 60, 192(%3) \n\t" | |||||
| "lxvp 62, 224(%3) \n\t" | |||||
| "stxv 33, 0(%3) \n\t" /* write the y data into x; odd register of each pair goes to the lower address */ | |||||
| "stxv 32, 16(%3) \n\t" | |||||
| "stxv 35, 32(%3) \n\t" | |||||
| "stxv 34, 48(%3) \n\t" | |||||
| "stxv 37, 64(%3) \n\t" | |||||
| "stxv 36, 80(%3) \n\t" | |||||
| "stxv 39, 96(%3) \n\t" | |||||
| "stxv 38, 112(%3) \n\t" | |||||
| "addi %3, %3, 128 \n\t" /* x += 128 bytes */ | |||||
| "stxv 41, 0(%3) \n\t" | |||||
| "stxv 40, 16(%3) \n\t" | |||||
| "stxv 43, 32(%3) \n\t" | |||||
| "stxv 42, 48(%3) \n\t" | |||||
| "stxv 45, 64(%3) \n\t" | |||||
| "stxv 44, 80(%3) \n\t" | |||||
| "stxv 47, 96(%3) \n\t" | |||||
| "stxv 46, 112(%3) \n\t" | |||||
| "addi %3, %3, 128 \n\t" /* x += 128 bytes */ | |||||
| "stxv 49, 0(%4) \n\t" /* write the x data into y */ | |||||
| "stxv 48, 16(%4) \n\t" | |||||
| "stxv 51, 32(%4) \n\t" | |||||
| "stxv 50, 48(%4) \n\t" | |||||
| "stxv 53, 64(%4) \n\t" | |||||
| "stxv 52, 80(%4) \n\t" | |||||
| "stxv 55, 96(%4) \n\t" | |||||
| "stxv 54, 112(%4) \n\t" | |||||
| "addi %4, %4, 128 \n\t" /* y += 128 bytes */ | |||||
| "stxv 57, 0(%4) \n\t" | |||||
| "stxv 56, 16(%4) \n\t" | |||||
| "stxv 59, 32(%4) \n\t" | |||||
| "stxv 58, 48(%4) \n\t" | |||||
| "stxv 61, 64(%4) \n\t" | |||||
| "stxv 60, 80(%4) \n\t" | |||||
| "stxv 63, 96(%4) \n\t" | |||||
| "stxv 62, 112(%4) \n\t" | |||||
| "addi %4, %4, 128 \n\t" /* y += 128 bytes */ | |||||
| #if defined(DOUBLE) | |||||
| "addic. %2, %2, -16 \n\t" | |||||
| #else | |||||
| "addic. %2, %2, -32 \n\t" | |||||
| #endif | |||||
| "bgt one%= \n" /* keep looping while n is still positive */ | |||||
| "#n=%2 x=%0=%3 y=%1=%4" /* assembler-comment line that names the operands */ | |||||
| : | |||||
| "+m" (*x), | |||||
| "+m" (*y), | |||||
| "+r" (n), // 2 | |||||
| "+b" (x), // 3 | |||||
| "+b" (y) // 4 | |||||
| : | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||||
| ); | |||||
| } | |||||
| @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dasum_microk_power8.c" | #include "dasum_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "dasum_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if ( inc_x == 1 ) | if ( inc_x == 1 ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 16 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (i = 0; i < align; i++) { | |||||
| sumf += ABS(x[i]); | |||||
| } | |||||
| } | |||||
| n1 = (n-i) & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sumf += dasum_kernel_16(n1, &x[i]); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| n1 = n & -16; | n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| sumf = dasum_kernel_16(n1, x); | sumf = dasum_kernel_16(n1, x); | ||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -0,0 +1,152 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
/*
 * dasum_kernel_16: returns the sum of |x[i]| for i in [0, n), computed by a
 * single inline-asm block that uses paired vector loads (lxvp).
 *
 * n - element count. It is decremented by 16 for every 128-byte block, and the
 *     first block is loaded unconditionally, so n has to be a positive
 *     multiple of 16 (the caller shown in this diff passes
 *     n1 = (n-i) & -16 and only calls when n1 > 0).
 * x - input vector; the local pointer is advanced by 128 bytes per block.
 *
 * Register use inside the asm: vs32-vs39 are eight running partial sums,
 * vs40-vs47 hold the block that is currently loaded, vs48-vs51 plus the
 * compiler-allocated t0-t3 hold the absolute values before accumulation.
 */
| static double dasum_kernel_16 (long n, double *x) | |||||
| { | |||||
| double sum; | |||||
| __vector double t0; | |||||
| __vector double t1; | |||||
| __vector double t2; | |||||
| __vector double t3; | |||||
| __asm__ | |||||
| ( | |||||
/* cache-touch hint for the start of x, then clear the accumulators vs32-vs39 */
| "dcbt 0, %2 \n\t" | |||||
| "xxlxor 32, 32, 32 \n\t" | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| "xxlxor 34, 34, 34 \n\t" | |||||
| "xxlxor 35, 35, 35 \n\t" | |||||
| "xxlxor 36, 36, 36 \n\t" | |||||
| "xxlxor 37, 37, 37 \n\t" | |||||
| "xxlxor 38, 38, 38 \n\t" | |||||
| "xxlxor 39, 39, 39 \n\t" | |||||
/* preload the first 16 doubles (four 32-byte register pairs) into vs40-vs47 */
| "lxvp 40, 0(%2) \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
/* step past the preloaded block and count it; if nothing remains, go straight to the tail */
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
/* main loop: each register pair is reloaded with the next block only after its
   absolute values have been taken, so loads overlap with the arithmetic */
| "one%=: \n\t" | |||||
| "xvabsdp 48, 40 \n\t" | |||||
| "xvabsdp 49, 41 \n\t" | |||||
| "xvabsdp 50, 42 \n\t" | |||||
| "xvabsdp 51, 43 \n\t" | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "xvabsdp %x3, 44 \n\t" | |||||
| "xvabsdp %x4, 45 \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "xvabsdp %x5, 46 \n\t" | |||||
| "xvabsdp %x6, 47 \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "xvadddp 32, 32, 48 \n\t" | |||||
| "xvadddp 33, 33, 49 \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "xvadddp 34, 34, 50 \n\t" | |||||
| "xvadddp 35, 35, 51 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "xvadddp 36, 36, %x3 \n\t" | |||||
| "xvadddp 37, 37, %x4 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "xvadddp 38, 38, %x5 \n\t" | |||||
| "xvadddp 39, 39, %x6 \n\t" | |||||
| "bgt one%= \n" | |||||
/* tail: accumulate the last block that was loaded; no further loads are issued */
| "two%=: \n\t" | |||||
| "xvabsdp 48, 40 \n\t" | |||||
| "xvabsdp 49, 41 \n\t" | |||||
| "xvabsdp 50, 42 \n\t" | |||||
| "xvabsdp 51, 43 \n\t" | |||||
| "xvabsdp %x3, 44 \n\t" | |||||
| "xvabsdp %x4, 45 \n\t" | |||||
| "xvabsdp %x5, 46 \n\t" | |||||
| "xvabsdp %x6, 47 \n\t" | |||||
| "xvadddp 32, 32, 48 \n\t" | |||||
| "xvadddp 33, 33, 49 \n\t" | |||||
| "xvadddp 34, 34, 50 \n\t" | |||||
| "xvadddp 35, 35, 51 \n\t" | |||||
| "xvadddp 36, 36, %x3 \n\t" | |||||
| "xvadddp 37, 37, %x4 \n\t" | |||||
| "xvadddp 38, 38, %x5 \n\t" | |||||
| "xvadddp 39, 39, %x6 \n\t" | |||||
/* pairwise reduction of the eight accumulators down to vs32 */
| "xvadddp 32, 32, 33 \n\t" | |||||
| "xvadddp 34, 34, 35 \n\t" | |||||
| "xvadddp 36, 36, 37 \n\t" | |||||
| "xvadddp 38, 38, 39 \n\t" | |||||
| "xvadddp 32, 32, 34 \n\t" | |||||
| "xvadddp 36, 36, 38 \n\t" | |||||
| "xvadddp 32, 32, 36 \n\t" | |||||
/* NOTE(review): XXSWAPD_S is a macro defined outside this block; presumably it
   writes vs32 with its two doublewords swapped into vs33 so that the scalar
   add below combines both lanes - confirm against its definition */
| XXSWAPD_S(33,32) | |||||
| "xsadddp %x0, 32, 33 \n" | |||||
| "#n=%1 x=%3=%2 sum=%0\n" | |||||
| "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" | |||||
| : | |||||
/* outputs: scalar result, loop counter, moving pointer, four scratch VSX registers */
| "=d" (sum), // 0 | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "=wa" (t0), // 3 | |||||
| "=wa" (t1), // 4 | |||||
| "=wa" (t2), // 5 | |||||
| "=wa" (t3) // 6 | |||||
| : | |||||
/* declares *x as a memory input of the asm */
| "m" (*x) | |||||
| : | |||||
/* cr0 is written by addic.; vs32-vs51 are the hard-coded vector registers used above */
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51" | |||||
| ); | |||||
| return sum; | |||||
| } | |||||
| @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | if ( (inc_x == 1) && (inc_y == 1 )) | ||||
| { | { | ||||
| BLASLONG n1 = n & -64; | |||||
| if ( n1 > 0 ) | |||||
| if ( n >= 64 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (i = 0; i < align; i++) { | |||||
| y[i] = x[i] ; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -64; | |||||
| if ( n1 ) | |||||
| { | { | ||||
| copy_kernel(n1, x, y); | |||||
| i=n1; | |||||
| copy_kernel(n1, &x[i], &y[i]); | |||||
| i += n1; | |||||
| } | } | ||||
| while(i < n) | while(i < n) | ||||
| @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| typedef __vector unsigned char vec_t; | typedef __vector unsigned char vec_t; | ||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | ||||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||||
/*
 * Compatibility shim: when the compiler does not report
 * __builtin_vsx_assemble_pair / __builtin_vsx_disassemble_pair as available,
 * map those names onto the __builtin_mma_* spellings so the code in this file
 * can use the vsx names unconditionally.
 */
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||||
| #endif | |||||
| #if !__has_builtin(__builtin_vsx_disassemble_pair) | |||||
| #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair | |||||
| #endif | |||||
| #ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
| #define SAVE_ACC(ACC, J) \ | #define SAVE_ACC(ACC, J) \ | ||||
| @@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | ||||
| @@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 3]; | rowA = (vec_t *) & AO[l << 3]; | ||||
| rb = (vec_t *) & BO[l << 3]; | rb = (vec_t *) & BO[l << 3]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | ||||
| @@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | ||||
| @@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 2]; | rowA = (vec_t *) & AO[l << 2]; | ||||
| rb = (vec_t *) & BO[l << 3]; | rb = (vec_t *) & BO[l << 3]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | ||||
| @@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 1]; | rowA = (vec_t *) & AO[l << 1]; | ||||
| rb = (vec_t *) & BO[l << 3]; | rb = (vec_t *) & BO[l << 3]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| } | } | ||||
| @@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | ||||
| @@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 3]; | rowA = (vec_t *) & AO[l << 3]; | ||||
| rb = (vec_t *) & BO[l << 2]; | rb = (vec_t *) & BO[l << 2]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | ||||
| @@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 2]; | rowA = (vec_t *) & AO[l << 2]; | ||||
| rb = (vec_t *) & BO[l << 2]; | rb = (vec_t *) & BO[l << 2]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| } | } | ||||
| @@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | vec_t *rb = (vec_t *) & BO[0]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 1]; | rowA = (vec_t *) & AO[l << 1]; | ||||
| rb = (vec_t *) & BO[l << 2]; | rb = (vec_t *) & BO[l << 2]; | ||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| } | } | ||||
| SAVE_ACC (&acc0, 0); | SAVE_ACC (&acc0, 0); | ||||
| @@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| v4sf_t result[4]; | v4sf_t result[4]; | ||||
| __vector_quad acc0, acc1, acc2, acc3; | __vector_quad acc0, acc1, acc2, acc3; | ||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||||
| t[0] = BO[0], t[1] = BO[1]; | |||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| @@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||||
| rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rb = (vec_t *) & BO[l << 1]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| rowA = (vec_t *) & AO[l << 3]; | rowA = (vec_t *) & AO[l << 3]; | ||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| @@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| v4sf_t result[4]; | v4sf_t result[4]; | ||||
| __vector_quad acc0, acc1; | __vector_quad acc0, acc1; | ||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||||
| t[0] = BO[0], t[1] = BO[1]; | |||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||||
| rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rb = (vec_t *) & BO[l << 1]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| rowA = (vec_t *) & AO[l << 2]; | rowA = (vec_t *) & AO[l << 2]; | ||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| @@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| v4sf_t result[4]; | v4sf_t result[4]; | ||||
| __vector_quad acc0; | __vector_quad acc0; | ||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||||
| t[0] = BO[0], t[1] = BO[1]; | |||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||||
| rb = (vec_t *) & t[0]; | |||||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rb = (vec_t *) & BO[l << 1]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||||
| rowA = (vec_t *) & AO[l << 1]; | rowA = (vec_t *) & AO[l << 1]; | ||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| } | } | ||||
| @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "drot_microk_power8.c" | #include "drot_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "drot_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | if ( (inc_x == 1) && (inc_y == 1) ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 16 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (i = 0; i < align; i++) { | |||||
| temp = c*x[i] + s*y[i] ; | |||||
| y[i] = c*y[i] - s*x[i] ; | |||||
| x[i] = temp ; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| drot_kernel_16(n1,&x[i], &y[i], c, s); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| drot_kernel_16(n1, x1, y1, c, s); | drot_kernel_16(n1, x1, y1, c, s); | ||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -0,0 +1,148 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
/*
 * drot_kernel_16: applies a plane rotation in place to the vectors x and y:
 *     x[i] <- c*x[i] + s*y[i]
 *     y[i] <- c*y[i] - s*x[i]      (computed from the original x[i])
 *
 * n    - element count. Each pass handles 8 doubles (64 bytes) of x and of y
 *        and subtracts 8 from n; the first 8 elements are loaded
 *        unconditionally, so n must be positive and a multiple of 8 (the
 *        caller shown in this diff passes a positive multiple of 16).
 * x, y - vectors updated in place; the local pointers advance 64 bytes per pass.
 * c, s - rotation coefficients, broadcast to vs36 and vs37.
 */
| static void drot_kernel_16 (long n, double *x, double *y, double c, double s) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| XXSPLTD_S(36,%x5,0) // load c to both dwords | |||||
| XXSPLTD_S(37,%x6,0) // load s to both dwords | |||||
| "lxvp 32, 0(%3) \n\t" // load x | |||||
| "lxvp 34, 32(%3) \n\t" | |||||
| "lxvp 48, 0(%4) \n\t" // load y | |||||
| "lxvp 50, 32(%4) \n\t" | |||||
/* count the preloaded 8 elements; if none remain, only the tail at "two" runs */
| "addic. %2, %2, -8 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
/* main loop: all four products are formed from the current x/y registers before
   those registers are reloaded from offsets 64/96 (the next 8 elements); the
   results are then stored at offsets 0/32 and both pointers advance by 64 */
| "one%=: \n\t" | |||||
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||||
| "xvmuldp 41, 33, 36 \n\t" | |||||
| "xvmuldp 42, 34, 36 \n\t" | |||||
| "xvmuldp 43, 35, 36 \n\t" | |||||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||||
| "xvmuldp 45, 33, 37 \n\t" | |||||
| "xvmuldp 46, 34, 37 \n\t" | |||||
| "xvmuldp 47, 35, 37 \n\t" | |||||
| "lxvp 32, 64(%3) \n\t" // load x | |||||
| "lxvp 34, 96(%3) \n\t" | |||||
| "xvmuldp 52, 48, 36 \n\t" // c * y | |||||
| "xvmuldp 53, 49, 36 \n\t" | |||||
| "xvmuldp 54, 50, 36 \n\t" | |||||
| "xvmuldp 55, 51, 36 \n\t" | |||||
| "xvmuldp 38, 48, 37 \n\t" // s * y | |||||
| "xvmuldp 39, 49, 37 \n\t" | |||||
| "xvmuldp 56, 50, 37 \n\t" | |||||
| "xvmuldp 57, 51, 37 \n\t" | |||||
| "lxvp 48, 64(%4) \n\t" // load y | |||||
| "lxvp 50, 96(%4) \n\t" | |||||
| "xvadddp 40, 40, 38 \n\t" // c * x + s * y | |||||
| "xvadddp 41, 41, 39 \n\t" // c * x + s * y | |||||
| "xvadddp 42, 42, 56 \n\t" // c * x + s * y | |||||
| "xvadddp 43, 43, 57 \n\t" // c * x + s * y | |||||
| "stxvp 40, 0(%3) \n\t" // store x | |||||
| "stxvp 42, 32(%3) \n\t" | |||||
| "xvsubdp 52, 52, 44 \n\t" // c * y - s * x | |||||
| "xvsubdp 53, 53, 45 \n\t" // c * y - s * x | |||||
| "xvsubdp 54, 54, 46 \n\t" // c * y - s * x | |||||
| "xvsubdp 55, 55, 47 \n\t" // c * y - s * x | |||||
| "stxvp 52, 0(%4) \n\t" // store y | |||||
| "stxvp 54, 32(%4) \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "addi %4, %4, 64 \n\t" | |||||
| "addic. %2, %2, -8 \n\t" | |||||
| "bgt one%= \n" | |||||
/* tail: rotate and store the last 8 elements already held in registers; no further loads */
| "two%=: \n\t" | |||||
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||||
| "xvmuldp 41, 33, 36 \n\t" | |||||
| "xvmuldp 42, 34, 36 \n\t" | |||||
| "xvmuldp 43, 35, 36 \n\t" | |||||
| "xvmuldp 52, 48, 36 \n\t" // c * y | |||||
| "xvmuldp 53, 49, 36 \n\t" | |||||
| "xvmuldp 54, 50, 36 \n\t" | |||||
| "xvmuldp 55, 51, 36 \n\t" | |||||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||||
| "xvmuldp 45, 33, 37 \n\t" | |||||
| "xvmuldp 46, 34, 37 \n\t" | |||||
| "xvmuldp 47, 35, 37 \n\t" | |||||
| "xvmuldp 38, 48, 37 \n\t" // s * y | |||||
| "xvmuldp 39, 49, 37 \n\t" | |||||
| "xvmuldp 56, 50, 37 \n\t" | |||||
| "xvmuldp 57, 51, 37 \n\t" | |||||
| "xvadddp 40, 40, 38 \n\t" // c * x + s * y | |||||
| "xvadddp 41, 41, 39 \n\t" // c * x + s * y | |||||
| "xvadddp 42, 42, 56 \n\t" // c * x + s * y | |||||
| "xvadddp 43, 43, 57 \n\t" // c * x + s * y | |||||
| "stxvp 40, 0(%3) \n\t" // store x | |||||
| "stxvp 42, 32(%3) \n\t" | |||||
| "xvsubdp 52, 52, 44 \n\t" // c * y - s * x | |||||
| "xvsubdp 53, 53, 45 \n\t" // c * y - s * x | |||||
| "xvsubdp 54, 54, 46 \n\t" // c * y - s * x | |||||
| "xvsubdp 55, 55, 47 \n\t" // c * y - s * x | |||||
| "stxvp 52, 0(%4) \n\t" // store y | |||||
| "stxvp 54, 32(%4) \n\t" | |||||
| "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" | |||||
| : | |||||
/* operands 0 and 1 declare *x and *y as read-write memory; 2-4 are the counter and moving pointers */
| "+m" (*x), | |||||
| "+m" (*y), | |||||
| "+r" (n), // 2 | |||||
| "+b" (x), // 3 | |||||
| "+b" (y) // 4 | |||||
| : | |||||
| "d" (c), // 5 | |||||
| "d" (s) // 6 | |||||
| : | |||||
/* cr0 is written by addic.; vs32-vs57 are the hard-coded vector registers used above */
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57" | |||||
| ); | |||||
| } | |||||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dscal_microk_power8.c" | #include "dscal_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "dscal_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| if ( da == 0.0 ) | if ( da == 0.0 ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 16 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (j = 0; j < align; j++) { | |||||
| x[j] = 0.0; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-j) & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dscal_kernel_8_zero(n1, &x[j]); | |||||
| j+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| dscal_kernel_8_zero(n1, x); | dscal_kernel_8_zero(n1, x); | ||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| else | else | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 16 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (j = 0; j < align; j++) { | |||||
| x[j] = da * x[j]; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-j) & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dscal_kernel_8(n1, &x[j], da); | |||||
| j+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| dscal_kernel_8(n1, x, da); | dscal_kernel_8(n1, x, da); | ||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| @@ -0,0 +1,134 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
/*
 * dscal_kernel_8: scales x in place, x[i] <- alpha * x[i], for i in [0, n).
 *
 * n     - element count. Each pass handles 16 doubles (128 bytes) and subtracts
 *         16 from n; the first 16 elements are loaded unconditionally, so n
 *         must be a positive multiple of 16 (the caller shown in this diff
 *         passes n1 = (n-j) & -16 and only calls when n1 > 0).
 * x     - vector updated in place; the local pointer advances 128 bytes per pass.
 * alpha - scale factor, broadcast to vs48.
 */
| static void dscal_kernel_8 (long n, double *x, double alpha) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
/* cache-touch hint for x, broadcast alpha, then preload the first 16 doubles into vs32-vs39 */
| "dcbt 0, %2 \n\t" | |||||
| XXSPLTD_S(48,%x3,0) | |||||
| "lxvp 32, 0(%2) \n\t" | |||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "lxvp 38, 96(%2) \n\t" | |||||
/* count the preloaded block; if nothing remains, only the tail at "two" runs */
| "addic. %1, %1, -16 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
/* main loop: the products go to vs40-vs47 so the source registers can be
   reloaded from offsets 128-224 (the next block) before the current results
   are stored at offsets 0-96 and the pointer advances */
| "one%=: \n\t" | |||||
| "xvmuldp 40, 32, 48 \n\t" | |||||
| "xvmuldp 41, 33, 48 \n\t" | |||||
| "xvmuldp 42, 34, 48 \n\t" | |||||
| "xvmuldp 43, 35, 48 \n\t" | |||||
| "lxvp 32, 128(%2) \n\t" | |||||
| "lxvp 34, 160(%2) \n\t" | |||||
| "xvmuldp 44, 36, 48 \n\t" | |||||
| "xvmuldp 45, 37, 48 \n\t" | |||||
| "xvmuldp 46, 38, 48 \n\t" | |||||
| "xvmuldp 47, 39, 48 \n\t" | |||||
| "lxvp 36, 192(%2) \n\t" | |||||
| "lxvp 38, 224(%2) \n\t" | |||||
| "stxvp 40, 0(%2) \n\t" | |||||
| "stxvp 42, 32(%2) \n\t" | |||||
| "stxvp 44, 64(%2) \n\t" | |||||
| "stxvp 46, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "bgt one%= \n" | |||||
/* tail: scale and store the last block already held in registers; no further loads */
| "two%=: \n\t" | |||||
| "xvmuldp 40, 32, 48 \n\t" | |||||
| "xvmuldp 41, 33, 48 \n\t" | |||||
| "xvmuldp 42, 34, 48 \n\t" | |||||
| "xvmuldp 43, 35, 48 \n\t" | |||||
| "xvmuldp 44, 36, 48 \n\t" | |||||
| "xvmuldp 45, 37, 48 \n\t" | |||||
| "xvmuldp 46, 38, 48 \n\t" | |||||
| "xvmuldp 47, 39, 48 \n\t" | |||||
| "stxvp 40, 0(%2) \n\t" | |||||
| "stxvp 42, 32(%2) \n\t" | |||||
| "stxvp 44, 64(%2) \n\t" | |||||
| "stxvp 46, 96(%2) \n\t" | |||||
| "#n=%1 alpha=%3 x=%0=%2" | |||||
| : | |||||
/* operand 0 declares *x as read-write memory; 1 and 2 are the counter and moving pointer */
| "+m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| "d" (alpha) // 3 | |||||
| : | |||||
/* cr0 is written by addic.; vs32-vs48 are the hard-coded vector registers used above */
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" | |||||
| ); | |||||
| } | |||||
| /* dscal_kernel_8_zero: overwrite the doubles at x with zero, 128 bytes (16 doubles) per pass. */ | |||||
| /* n - element count; decremented by 16 per pass. */ | |||||
| /* x - destination; it is only written, never read. */ | |||||
| /* The loop tests its counter at the bottom, so the first 128 bytes are always written. */ | |||||
| /* Presumably callers only pass n as a positive multiple of 16 - TODO confirm against the caller. */ | |||||
| static void dscal_kernel_8_zero (long n, double *x) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| /* Clear vs32 and vs33; together they form the register pair that stxvp 32 stores. */ | |||||
| "xxlxor 32, 32, 32 \n\t" | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| /* Four 32-byte pair stores of zeros cover one 128-byte group. */ | |||||
| "stxvp 32, 0(%2) \n\t" | |||||
| "stxvp 32, 32(%2) \n\t" | |||||
| "stxvp 32, 64(%2) \n\t" | |||||
| "stxvp 32, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "bgt one%= \n" | |||||
| /* The next string is an assembler comment only; it emits no instruction. */ | |||||
| "#n=%1 x=%0=%2 " | |||||
| : | |||||
| /* Operand 0: write-only memory at *x. NOTE(review): this names a single double although n of them are stored, */ | |||||
| /* and there is no "memory" clobber - confirm this is sufficient for the compiler. */ | |||||
| "=m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| : | |||||
| "cr0","vs32","vs33" | |||||
| ); | |||||
| } | |||||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dswap_microk_power8.c" | #include "dswap_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "swap_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | if ( (inc_x == 1) && (inc_y == 1 )) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 32 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||||
| for (i = 0; i < align; i++) { | |||||
| temp = y[i]; | |||||
| y[i] = x[i]; | |||||
| x[i] = temp; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| dswap_kernel_32(n1,&x[i], &y[i]); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| dswap_kernel_32(n1, x, y); | dswap_kernel_32(n1, x, y); | ||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sasum_microk_power8.c" | #include "sasum_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "sasum_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| if ( inc_x == 1 ) | if ( inc_x == 1 ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 32 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (i = 0; i < align; i++) { | |||||
| sumf += ABS(x[i]); | |||||
| } | |||||
| } | |||||
| n1 = (n-i) & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sumf += sasum_kernel_32(n1, &x[i]); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| n1 = n & -32; | n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| sumf = sasum_kernel_32(n1, x); | sumf = sasum_kernel_32(n1, x); | ||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -0,0 +1,153 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_32 1 | |||||
| /* sasum_kernel_32: return the sum of the absolute values of the floats at x. */ | |||||
| /* n - element count; decremented by 32 per pass while x advances by 128 bytes (32 floats). */ | |||||
| /* x - input data; it is only read. */ | |||||
| /* Eight vector accumulators (vs32..vs39) are kept and folded into one scalar at the end. */ | |||||
| /* The preload and the tail at "two" run unconditionally, so one group of 32 floats is always summed. */ | |||||
| /* Presumably callers only pass n as a positive multiple of 32 - TODO confirm against the caller. */ | |||||
| static float sasum_kernel_32 (long n, float *x) | |||||
| { | |||||
| float sum; | |||||
| /* t0..t3 are scratch vectors the compiler allocates for the asm (operands 3..6). */ | |||||
| __vector float t0; | |||||
| __vector float t1; | |||||
| __vector float t2; | |||||
| __vector float t3; | |||||
| __asm__ | |||||
| ( | |||||
| /* Prefetch hint for the start of x. */ | |||||
| "dcbt 0, %2 \n\t" | |||||
| /* Zero the eight accumulators vs32..vs39. */ | |||||
| "xxlxor 32, 32, 32 \n\t" | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| "xxlxor 34, 34, 34 \n\t" | |||||
| "xxlxor 35, 35, 35 \n\t" | |||||
| "xxlxor 36, 36, 36 \n\t" | |||||
| "xxlxor 37, 37, 37 \n\t" | |||||
| "xxlxor 38, 38, 38 \n\t" | |||||
| "xxlxor 39, 39, 39 \n\t" | |||||
| /* Preload the first 128 bytes into vs40..vs47 and advance x past them. */ | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| /* Account for the preloaded group; if nothing remains after it, go straight to the tail. */ | |||||
| "addic. %1, %1, -32 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
| /* Main loop: take absolute values of the loaded group, reload the inputs with the next group, then accumulate. */ | |||||
| "one%=: \n\t" | |||||
| "xvabssp 48, 40 \n\t" | |||||
| "xvabssp 49, 41 \n\t" | |||||
| "xvabssp 50, 42 \n\t" | |||||
| "xvabssp 51, 43 \n\t" | |||||
| /* Each lxvp below refills a register pair whose absolute values were already taken. */ | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "xvabssp %x3, 44 \n\t" | |||||
| "xvabssp %x4, 45 \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "xvabssp %x5, 46 \n\t" | |||||
| "xvabssp %x6, 47 \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "xvaddsp 32, 32, 48 \n\t" | |||||
| "xvaddsp 33, 33, 49 \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "xvaddsp 34, 34, 50 \n\t" | |||||
| "xvaddsp 35, 35, 51 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "xvaddsp 36, 36, %x3 \n\t" | |||||
| "xvaddsp 37, 37, %x4 \n\t" | |||||
| "addic. %1, %1, -32 \n\t" | |||||
| "xvaddsp 38, 38, %x5 \n\t" | |||||
| "xvaddsp 39, 39, %x6 \n\t" | |||||
| "bgt one%= \n" | |||||
| /* Tail: fold in the last loaded group; no further loads are issued here. */ | |||||
| "two%=: \n\t" | |||||
| "xvabssp 48, 40 \n\t" | |||||
| "xvabssp 49, 41 \n\t" | |||||
| "xvabssp 50, 42 \n\t" | |||||
| "xvabssp 51, 43 \n\t" | |||||
| "xvabssp %x3, 44 \n\t" | |||||
| "xvabssp %x4, 45 \n\t" | |||||
| "xvabssp %x5, 46 \n\t" | |||||
| "xvabssp %x6, 47 \n\t" | |||||
| "xvaddsp 32, 32, 48 \n\t" | |||||
| "xvaddsp 33, 33, 49 \n\t" | |||||
| "xvaddsp 34, 34, 50 \n\t" | |||||
| "xvaddsp 35, 35, 51 \n\t" | |||||
| "xvaddsp 36, 36, %x3 \n\t" | |||||
| "xvaddsp 37, 37, %x4 \n\t" | |||||
| "xvaddsp 38, 38, %x5 \n\t" | |||||
| "xvaddsp 39, 39, %x6 \n\t" | |||||
| /* Pairwise reduction of the eight accumulators down to vs32. */ | |||||
| "xvaddsp 32, 32, 33 \n\t" | |||||
| "xvaddsp 34, 34, 35 \n\t" | |||||
| "xvaddsp 36, 36, 37 \n\t" | |||||
| "xvaddsp 38, 38, 39 \n\t" | |||||
| "xvaddsp 32, 32, 34 \n\t" | |||||
| "xvaddsp 36, 36, 38 \n\t" | |||||
| "xvaddsp 32, 32, 36 \n\t" | |||||
| /* Horizontal sum inside vs32: add a copy rotated by two words, then a copy rotated by one word. */ | |||||
| "xxsldwi 33, 32, 32, 2 \n\t" | |||||
| "xvaddsp 32, 32, 33 \n\t" | |||||
| "xxsldwi 33, 32, 32, 1 \n\t" | |||||
| "xvaddsp 32, 32, 33 \n\t" | |||||
| /* Convert the single-precision total in vs32 to the scalar format of the output register (operand 0). */ | |||||
| "xscvspdp %x0, 32 \n" | |||||
| /* The next two strings are assembler comments only. NOTE(review): %3 in the first one is t0; the *x memory operand is %7. */ | |||||
| "#n=%1 x=%3=%2 sum=%0\n" | |||||
| "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" | |||||
| : | |||||
| "=f" (sum), // 0 | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "=wa" (t0), // 3 | |||||
| "=wa" (t1), // 4 | |||||
| "=wa" (t2), // 5 | |||||
| "=wa" (t3) // 6 | |||||
| : | |||||
| /* Operand 7: input memory at *x. NOTE(review): this names a single float although n of them are read - confirm this is sufficient. */ | |||||
| "m" (*x) | |||||
| : | |||||
| /* Clobbers: cr0 (set by addic.) and every fixed VSX register named in the template. */ | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51" | |||||
| ); | |||||
| return sum; | |||||
| } | |||||
| @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | if ( (inc_x == 1) && (inc_y == 1 )) | ||||
| { | { | ||||
| BLASLONG n1 = n & -128; | |||||
| if ( n1 > 0 ) | |||||
| if ( n >= 128 ) | |||||
| { | { | ||||
| copy_kernel (n1, x, y); | |||||
| i=n1; | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (i = 0; i < align; i++) { | |||||
| y[i] = x[i] ; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -128; | |||||
| if ( n1 ) | |||||
| { | |||||
| copy_kernel(n1, &x[i], &y[i]); | |||||
| i += n1; | |||||
| } | } | ||||
| while(i < n) | while(i < n) | ||||
| @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "srot_microk_power8.c" | #include "srot_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "srot_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | if ( (inc_x == 1) && (inc_y == 1) ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 16 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (i = 0; i < align; i++) { | |||||
| temp = c*x[i] + s*y[i] ; | |||||
| y[i] = c*y[i] - s*x[i] ; | |||||
| x[i] = temp ; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -16; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| srot_kernel_16(n1, &x1[i], &y1[i], c, s); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -16; | BLASLONG n1 = n & -16; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| temp = c*x[i] + s*y[i] ; | temp = c*x[i] + s*y[i] ; | ||||
| @@ -0,0 +1,151 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
| /* srot_kernel_16: apply a plane rotation in place to the float arrays x and y: */ | |||||
| /*   x[i] = c*x[i] + s*y[i]   and   y[i] = c*y[i] - s*x[i]   (both use the old x[i] and y[i]). */ | |||||
| /* n    - element count; decremented by 16 per pass while x and y each advance by 64 bytes (16 floats). */ | |||||
| /* x, y - the two arrays; both are read and overwritten. */ | |||||
| /* c, s - rotation coefficients, handed to the asm in floating-point registers. */ | |||||
| /* The preload and the tail at "two" run unconditionally, so one group of 16 elements is always processed. */ | |||||
| /* Presumably callers only pass n as a positive multiple of 16 - TODO confirm against the caller. */ | |||||
| static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| /* vs36 holds c in every word and vs37 holds s in every word for the rest of the kernel. */ | |||||
| "xscvdpspn 36, %x5 \n\t" // load c to all words | |||||
| "xxspltw 36, 36, 0 \n\t" | |||||
| "xscvdpspn 37, %x6 \n\t" // load s to all words | |||||
| "xxspltw 37, 37, 0 \n\t" | |||||
| /* Preload the first 64 bytes of x into vs32..vs35 and of y into vs48..vs51. */ | |||||
| "lxvp 32, 0(%3) \n\t" // load x | |||||
| "lxvp 34, 32(%3) \n\t" | |||||
| "lxvp 48, 0(%4) \n\t" // load y | |||||
| "lxvp 50, 32(%4) \n\t" | |||||
| /* Account for the preloaded group; if nothing remains after it, go straight to the tail. */ | |||||
| "addic. %2, %2, -16 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
| /* Main loop: form the four products for the loaded group while loading the next group, then combine and store. */ | |||||
| "one%=: \n\t" | |||||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||||
| "xvmulsp 41, 33, 36 \n\t" | |||||
| "xvmulsp 42, 34, 36 \n\t" | |||||
| "xvmulsp 43, 35, 36 \n\t" | |||||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||||
| "xvmulsp 45, 33, 37 \n\t" | |||||
| "xvmulsp 46, 34, 37 \n\t" | |||||
| "xvmulsp 47, 35, 37 \n\t" | |||||
| /* Both x products are done, so vs32..vs35 can be refilled with the next x group. */ | |||||
| "lxvp 32, 64(%3) \n\t" // load x | |||||
| "lxvp 34, 96(%3) \n\t" | |||||
| "xvmulsp 52, 48, 36 \n\t" // c * y | |||||
| "xvmulsp 53, 49, 36 \n\t" | |||||
| "xvmulsp 54, 50, 36 \n\t" | |||||
| "xvmulsp 55, 51, 36 \n\t" | |||||
| "xvmulsp 38, 48, 37 \n\t" // s * y | |||||
| "xvmulsp 39, 49, 37 \n\t" | |||||
| "xvmulsp 56, 50, 37 \n\t" | |||||
| "xvmulsp 57, 51, 37 \n\t" | |||||
| /* Both y products are done, so vs48..vs51 can be refilled with the next y group. */ | |||||
| "lxvp 48, 64(%4) \n\t" // load y | |||||
| "lxvp 50, 96(%4) \n\t" | |||||
| "xvaddsp 40, 40, 38 \n\t" // c * x + s * y | |||||
| "xvaddsp 41, 41, 39 \n\t" // c * x + s * y | |||||
| "xvaddsp 42, 42, 56 \n\t" // c * x + s * y | |||||
| "xvaddsp 43, 43, 57 \n\t" // c * x + s * y | |||||
| "stxvp 40, 0(%3) \n\t" // store x | |||||
| "stxvp 42, 32(%3) \n\t" | |||||
| "xvsubsp 52, 52, 44 \n\t" // c * y - s * x | |||||
| "xvsubsp 53, 53, 45 \n\t" // c * y - s * x | |||||
| "xvsubsp 54, 54, 46 \n\t" // c * y - s * x | |||||
| "xvsubsp 55, 55, 47 \n\t" // c * y - s * x | |||||
| "stxvp 52, 0(%4) \n\t" // store y | |||||
| "stxvp 54, 32(%4) \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "addi %4, %4, 64 \n\t" | |||||
| "addic. %2, %2, -16 \n\t" | |||||
| "bgt one%= \n" | |||||
| /* Tail: rotate and store the last loaded group; no further loads are issued here. */ | |||||
| "two%=: \n\t" | |||||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||||
| "xvmulsp 41, 33, 36 \n\t" | |||||
| "xvmulsp 42, 34, 36 \n\t" | |||||
| "xvmulsp 43, 35, 36 \n\t" | |||||
| "xvmulsp 52, 48, 36 \n\t" // c * y | |||||
| "xvmulsp 53, 49, 36 \n\t" | |||||
| "xvmulsp 54, 50, 36 \n\t" | |||||
| "xvmulsp 55, 51, 36 \n\t" | |||||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||||
| "xvmulsp 45, 33, 37 \n\t" | |||||
| "xvmulsp 46, 34, 37 \n\t" | |||||
| "xvmulsp 47, 35, 37 \n\t" | |||||
| "xvmulsp 38, 48, 37 \n\t" // s * y | |||||
| "xvmulsp 39, 49, 37 \n\t" | |||||
| "xvmulsp 56, 50, 37 \n\t" | |||||
| "xvmulsp 57, 51, 37 \n\t" | |||||
| "xvaddsp 40, 40, 38 \n\t" // c * x + s * y | |||||
| "xvaddsp 41, 41, 39 \n\t" // c * x + s * y | |||||
| "xvaddsp 42, 42, 56 \n\t" // c * x + s * y | |||||
| "xvaddsp 43, 43, 57 \n\t" // c * x + s * y | |||||
| "stxvp 40, 0(%3) \n\t" // store x | |||||
| "stxvp 42, 32(%3) \n\t" | |||||
| "xvsubsp 52, 52, 44 \n\t" // c * y - s * x | |||||
| "xvsubsp 53, 53, 45 \n\t" // c * y - s * x | |||||
| "xvsubsp 54, 54, 46 \n\t" // c * y - s * x | |||||
| "xvsubsp 55, 55, 47 \n\t" // c * y - s * x | |||||
| "stxvp 52, 0(%4) \n\t" // store y | |||||
| "stxvp 54, 32(%4) \n\t" | |||||
| /* The next string is an assembler comment only; it emits no instruction. */ | |||||
| "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" | |||||
| : | |||||
| /* Operands 0 and 1: memory at *x and *y. NOTE(review): each names a single float although n of them are */ | |||||
| /* rewritten, and there is no "memory" clobber - confirm this is sufficient for the compiler. */ | |||||
| "+m" (*x), | |||||
| "+m" (*y), | |||||
| "+r" (n), // 2 | |||||
| "+b" (x), // 3 | |||||
| "+b" (y) // 4 | |||||
| : | |||||
| "f" (c), // 5 | |||||
| "f" (s) // 6 | |||||
| : | |||||
| /* Clobbers: cr0 (set by addic.) and every VSX register named in the template. */ | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57" | |||||
| ); | |||||
| } | |||||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sscal_microk_power8.c" | #include "sscal_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "sscal_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| if ( da == 0.0 ) | if ( da == 0.0 ) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 32 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (j = 0; j < align; j++) { | |||||
| x[j] = 0.0; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-j) & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sscal_kernel_16_zero(n1, &x[j]); | |||||
| j+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| sscal_kernel_16_zero(n1, x); | sscal_kernel_16_zero(n1, x); | ||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| else | else | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 32 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (j = 0; j < align; j++) { | |||||
| x[j] = da * x[j]; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-j) & -32; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sscal_kernel_16(n1, &x[j], da); | |||||
| j+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| sscal_kernel_16(n1, x, da); | sscal_kernel_16(n1, x, da); | ||||
| j=n1; | j=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(j < n) | while(j < n) | ||||
| { | { | ||||
| @@ -0,0 +1,135 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
| /* sscal_kernel_16: multiply the floats at x by alpha, in place, using lxvp/stxvp vector-pair accesses. */ | |||||
| /* n     - element count; despite the _16 suffix the counter drops by 32 per pass (128 bytes = 32 floats). */ | |||||
| /* x     - start of the data; it is both read and overwritten. */ | |||||
| /* alpha - scale factor, handed to the asm in a floating-point register. */ | |||||
| /* The preload and the tail at "two" run unconditionally, so one group of 32 floats is always processed. */ | |||||
| /* Presumably callers only pass n as a positive multiple of 32 - TODO confirm against the caller. */ | |||||
| static void sscal_kernel_16 (long n, float *x, float alpha) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| /* Prefetch hint for the start of x. */ | |||||
| "dcbt 0, %2 \n\t" | |||||
| /* Convert alpha to single-precision format and copy it into every word of vs48. */ | |||||
| "xscvdpspn 48, %x3 \n\t" | |||||
| "xxspltw 48, 48, 0 \n\t" | |||||
| /* Preload the first 128 bytes of x into vs32..vs39. */ | |||||
| "lxvp 32, 0(%2) \n\t" | |||||
| "lxvp 34, 32(%2) \n\t" | |||||
| "lxvp 36, 64(%2) \n\t" | |||||
| "lxvp 38, 96(%2) \n\t" | |||||
| /* Account for the preloaded group; if nothing remains after it, go straight to the tail. */ | |||||
| "addic. %1, %1, -32 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
| /* Main loop: scale the group held in vs32..vs39 into vs40..vs47 while loading the next 128 bytes. */ | |||||
| "one%=: \n\t" | |||||
| "xvmulsp 40, 32, 48 \n\t" | |||||
| "xvmulsp 41, 33, 48 \n\t" | |||||
| "xvmulsp 42, 34, 48 \n\t" | |||||
| "xvmulsp 43, 35, 48 \n\t" | |||||
| /* vs32..vs35 have been consumed above, so they can be refilled with the next group. */ | |||||
| "lxvp 32, 128(%2) \n\t" | |||||
| "lxvp 34, 160(%2) \n\t" | |||||
| "xvmulsp 44, 36, 48 \n\t" | |||||
| "xvmulsp 45, 37, 48 \n\t" | |||||
| "xvmulsp 46, 38, 48 \n\t" | |||||
| "xvmulsp 47, 39, 48 \n\t" | |||||
| "lxvp 36, 192(%2) \n\t" | |||||
| "lxvp 38, 224(%2) \n\t" | |||||
| /* Write the scaled group back over the current 128 bytes, then step to the next group. */ | |||||
| "stxvp 40, 0(%2) \n\t" | |||||
| "stxvp 42, 32(%2) \n\t" | |||||
| "stxvp 44, 64(%2) \n\t" | |||||
| "stxvp 46, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -32 \n\t" | |||||
| "bgt one%= \n" | |||||
| /* Tail: scale and store the last loaded group; no further loads are issued here. */ | |||||
| "two%=: \n\t" | |||||
| "xvmulsp 40, 32, 48 \n\t" | |||||
| "xvmulsp 41, 33, 48 \n\t" | |||||
| "xvmulsp 42, 34, 48 \n\t" | |||||
| "xvmulsp 43, 35, 48 \n\t" | |||||
| "xvmulsp 44, 36, 48 \n\t" | |||||
| "xvmulsp 45, 37, 48 \n\t" | |||||
| "xvmulsp 46, 38, 48 \n\t" | |||||
| "xvmulsp 47, 39, 48 \n\t" | |||||
| "stxvp 40, 0(%2) \n\t" | |||||
| "stxvp 42, 32(%2) \n\t" | |||||
| "stxvp 44, 64(%2) \n\t" | |||||
| "stxvp 46, 96(%2) \n\t" | |||||
| /* The next string is an assembler comment only; it emits no instruction. */ | |||||
| "#n=%1 alpha=%3 x=%0=%2" | |||||
| : | |||||
| /* Operand 0: memory at *x. NOTE(review): this names a single float although the asm stores n of them, */ | |||||
| /* and the clobber list has no "memory" entry - confirm this is sufficient for the compiler. */ | |||||
| "+m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| "f" (alpha) // 3 | |||||
| : | |||||
| /* Clobbers: cr0 (set by addic.) and every VSX register named in the template. */ | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" | |||||
| ); | |||||
| } | |||||
| /* sscal_kernel_16_zero: overwrite the floats at x with zero, 128 bytes (32 floats) per pass. */ | |||||
| /* n - element count; despite the _16 suffix the counter drops by 32 per pass. */ | |||||
| /* x - destination; it is only written, never read. */ | |||||
| /* The loop tests its counter at the bottom, so the first 128 bytes are always written. */ | |||||
| /* Presumably callers only pass n as a positive multiple of 32 - TODO confirm against the caller. */ | |||||
| static void sscal_kernel_16_zero (long n, float *x) | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| /* Clear vs32 and vs33; together they form the register pair that stxvp 32 stores. */ | |||||
| "xxlxor 32, 32, 32 \n\t" | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| /* Four 32-byte pair stores of zeros cover one 128-byte group. */ | |||||
| "stxvp 32, 0(%2) \n\t" | |||||
| "stxvp 32, 32(%2) \n\t" | |||||
| "stxvp 32, 64(%2) \n\t" | |||||
| "stxvp 32, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -32 \n\t" | |||||
| "bgt one%= \n" | |||||
| /* The next string is an assembler comment only; it emits no instruction. */ | |||||
| "#n=%1 x=%0=%2 " | |||||
| : | |||||
| /* Operand 0: write-only memory at *x. NOTE(review): this names a single float although n of them are stored, */ | |||||
| /* and there is no "memory" clobber - confirm this is sufficient for the compiler. */ | |||||
| "=m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| : | |||||
| "cr0","vs32","vs33" | |||||
| ); | |||||
| } | |||||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sswap_microk_power8.c" | #include "sswap_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "swap_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||||
| if ( (inc_x == 1) && (inc_y == 1 )) | if ( (inc_x == 1) && (inc_y == 1 )) | ||||
| { | { | ||||
| #if defined(POWER10) | |||||
| if ( n >= 64 ) | |||||
| { | |||||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||||
| for (i = 0; i < align; i++) { | |||||
| temp = y[i]; | |||||
| y[i] = x[i]; | |||||
| x[i] = temp; | |||||
| } | |||||
| } | |||||
| BLASLONG n1 = (n-i) & -64; | |||||
| if ( n1 > 0 ) | |||||
| { | |||||
| sswap_kernel_32(n1,&x[i], &y[i]); | |||||
| i+=n1; | |||||
| } | |||||
| #else | |||||
| BLASLONG n1 = n & -32; | BLASLONG n1 = n & -32; | ||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| sswap_kernel_32(n1, x, y); | sswap_kernel_32(n1, x, y); | ||||
| i=n1; | i=n1; | ||||
| } | } | ||||
| #endif | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -0,0 +1,105 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_32 1 | |||||
| /* Swap kernel shared by both precisions: exchange the contents of x and y, 256 bytes of each per pass. */ | |||||
| /* With DOUBLE defined it is compiled as dswap_kernel_32 (32 doubles per pass), otherwise as */ | |||||
| /* sswap_kernel_32, which despite its _32 suffix covers 64 floats per pass. */ | |||||
| /* n    - element count; decremented by 32 (DOUBLE) or 64 (otherwise) per pass. */ | |||||
| /* x, y - the two arrays; both are read and overwritten. */ | |||||
| /* The loop tests its counter at the bottom, so one 256-byte group is always swapped. */ | |||||
| /* Presumably callers only pass n as a positive multiple of the per-pass count - TODO confirm against the caller. */ | |||||
| #if defined(DOUBLE) | |||||
| static void dswap_kernel_32 (long n, double *x, double *y) | |||||
| #else | |||||
| static void sswap_kernel_32 (long n, float *x, float *y) | |||||
| #endif | |||||
| { | |||||
| __asm__ | |||||
| ( | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| /* Load 256 bytes of y (operand 4) into vs32..vs47. */ | |||||
| "lxvp 32, 0(%4) \n\t" | |||||
| "lxvp 34, 32(%4) \n\t" | |||||
| "lxvp 36, 64(%4) \n\t" | |||||
| "lxvp 38, 96(%4) \n\t" | |||||
| "lxvp 40, 128(%4) \n\t" | |||||
| "lxvp 42, 160(%4) \n\t" | |||||
| "lxvp 44, 192(%4) \n\t" | |||||
| "lxvp 46, 224(%4) \n\t" | |||||
| /* Load 256 bytes of x (operand 3) into vs48..vs63. */ | |||||
| "lxvp 48, 0(%3) \n\t" | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "lxvp 56, 128(%3) \n\t" | |||||
| "lxvp 58, 160(%3) \n\t" | |||||
| "lxvp 60, 192(%3) \n\t" | |||||
| "lxvp 62, 224(%3) \n\t" | |||||
| /* Store the data that came from y into x. */ | |||||
| "stxvp 32, 0(%3) \n\t" | |||||
| "stxvp 34, 32(%3) \n\t" | |||||
| "stxvp 36, 64(%3) \n\t" | |||||
| "stxvp 38, 96(%3) \n\t" | |||||
| "stxvp 40, 128(%3) \n\t" | |||||
| "stxvp 42, 160(%3) \n\t" | |||||
| "stxvp 44, 192(%3) \n\t" | |||||
| "stxvp 46, 224(%3) \n\t" | |||||
| /* Store the data that came from x into y. */ | |||||
| "stxvp 48, 0(%4) \n\t" | |||||
| "stxvp 50, 32(%4) \n\t" | |||||
| "stxvp 52, 64(%4) \n\t" | |||||
| "stxvp 54, 96(%4) \n\t" | |||||
| "stxvp 56, 128(%4) \n\t" | |||||
| "stxvp 58, 160(%4) \n\t" | |||||
| "stxvp 60, 192(%4) \n\t" | |||||
| "stxvp 62, 224(%4) \n\t" | |||||
| /* Advance both pointers by the 256 bytes just swapped. */ | |||||
| "addi %4, %4, 256 \n\t" | |||||
| "addi %3, %3, 256 \n\t" | |||||
| /* 256 bytes hold 32 doubles or 64 floats, so the counter step depends on the precision. */ | |||||
| #if defined(DOUBLE) | |||||
| "addic. %2, %2, -32 \n\t" | |||||
| #else | |||||
| "addic. %2, %2, -64 \n\t" | |||||
| #endif | |||||
| "bgt one%= \n" | |||||
| /* The next string is an assembler comment only; it emits no instruction. */ | |||||
| "#n=%2 x=%0=%3 y=%1=%4" | |||||
| : | |||||
| /* Operands 0 and 1: memory at *x and *y. NOTE(review): each names a single element although n of them are */ | |||||
| /* rewritten, and there is no "memory" clobber - confirm this is sufficient for the compiler. */ | |||||
| "+m" (*x), | |||||
| "+m" (*y), | |||||
| "+r" (n), // 2 | |||||
| "+b" (x), // 3 | |||||
| "+b" (y) // 4 | |||||
| : | |||||
| : | |||||
| /* Clobbers: cr0 (set by addic.) and vs32..vs63, all of which are used as staging registers. */ | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||||
| ); | |||||
| } | |||||
| @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #include "zscal_microk_power8.c" | #include "zscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| #elif defined(POWER10) | |||||
| #if defined(DOUBLE) | |||||
| #include "zscal_microk_power10.c" | |||||
| #else | |||||
| #include "cscal_microk_power10.c" | |||||
| #endif | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| { | { | ||||
| #if defined(DOUBLE) | |||||
| n1 = n & -8; | n1 = n & -8; | ||||
| #else | |||||
| n1 = n & -16; | |||||
| #endif | |||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| zscal_kernel_8(n1, x, da_r, da_i); | zscal_kernel_8(n1, x, da_r, da_i); | ||||
| @@ -0,0 +1,195 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| /* Scale a double-complex vector in place: x[i] = (alpha_r + i*alpha_i) * x[i]. | |||||
| * | |||||
| * n       - number of complex elements; every pass handles 8 elements (128 bytes) | |||||
| *           and subtracts 8 from n. Assumes n is a positive multiple of 8 -- | |||||
| *           TODO confirm against the caller. | |||||
| * x       - interleaved (real, imag) doubles, overwritten with the scaled values. | |||||
| * alpha_r - real part of the scale factor. | |||||
| * alpha_i - imaginary part of the scale factor. | |||||
| * | |||||
| * The loop is software pipelined: the loads for the next 8 elements are issued | |||||
| * before the current results are stored, and the code after label two drains | |||||
| * the last set of loaded values without issuing further loads. | |||||
| */ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||||
| { | |||||
| __vector double t0; /* t0..t5: scratch vector registers picked by the compiler (operands 3..8) */ | |||||
| __vector double t1; | |||||
| __vector double t2; | |||||
| __vector double t3; | |||||
| __vector double t4; | |||||
| __vector double t5; | |||||
| __asm__ | |||||
| ( | |||||
| "dcbt 0, %2 \n\t" /* cache touch hint for the start of x */ | |||||
| "xsnegdp 33, %x10 \n\t" // -alpha_i | |||||
| XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r | |||||
| XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i | |||||
| "lxvp 40, 0(%2) \n\t" /* preload the first 8 elements into vs40..vs47 */ | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "addic. %1, %1, -8 \n\t" /* n -= 8, recording the comparison with zero in cr0 */ | |||||
| "ble two%= \n\t" /* nothing beyond the preloaded block: go straight to the drain code */ | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmuldp 49, 41, 32 \n\t" | |||||
| "xvmuldp 50, 42, 32 \n\t" | |||||
| "xvmuldp 51, 43, 32 \n\t" | |||||
| "xvmuldp 34, 44, 32 \n\t" | |||||
| "xvmuldp 35, 45, 32 \n\t" | |||||
| "xvmuldp 36, 46, 32 \n\t" | |||||
| "xvmuldp 37, 47, 32 \n\t" | |||||
| XXSWAPD_S(38,40) /* presumably swaps the two doubles of each input (macro defined outside this chunk) -- confirm */ | |||||
| XXSWAPD_S(39,41) | |||||
| XXSWAPD_S(%x3,42) | |||||
| XXSWAPD_S(%x4,43) | |||||
| XXSWAPD_S(%x5,44) | |||||
| XXSWAPD_S(%x6,45) | |||||
| XXSWAPD_S(%x7,46) | |||||
| XXSWAPD_S(%x8,47) | |||||
| "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmuldp 39, 39, 33 \n\t" | |||||
| "xvmuldp %x3, %x3, 33 \n\t" | |||||
| "xvmuldp %x4, %x4, 33 \n\t" | |||||
| "lxvp 40, 128(%2) \n\t" /* inputs in vs40..vs43 are consumed: start loading the next block */ | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "xvmuldp %x5, %x5, 33 \n\t" | |||||
| "xvmuldp %x6, %x6, 33 \n\t" | |||||
| "xvmuldp %x7, %x7, 33 \n\t" | |||||
| "xvmuldp %x8, %x8, 33 \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | |||||
| "xvadddp 48, 48, 38 \n\t" /* add the two partial products to form each scaled element */ | |||||
| "xvadddp 49, 49, 39 \n\t" | |||||
| "xvadddp 50, 50, %x3 \n\t" | |||||
| "xvadddp 51, 51, %x4 \n\t" | |||||
| "stxv 49, 0(%2) \n\t" /* the higher-numbered register of each pair goes to the lower address */ | |||||
| "stxv 48, 16(%2) \n\t" | |||||
| "stxv 51, 32(%2) \n\t" | |||||
| "stxv 50, 48(%2) \n\t" | |||||
| "xvadddp 34, 34, %x5 \n\t" | |||||
| "xvadddp 35, 35, %x6 \n\t" | |||||
| "xvadddp 36, 36, %x7 \n\t" | |||||
| "xvadddp 37, 37, %x8 \n\t" | |||||
| "stxv 35, 64(%2) \n\t" | |||||
| "stxv 34, 80(%2) \n\t" | |||||
| "stxv 37, 96(%2) \n\t" | |||||
| "stxv 36, 112(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" /* advance x by 8 complex elements */ | |||||
| "addic. %1, %1, -8 \n\t" | |||||
| "bgt one%= \n" /* keep looping while elements remain beyond the block just loaded */ | |||||
| "two%=: \n\t" /* drain: same arithmetic as the loop body, without the loads */ | |||||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmuldp 49, 41, 32 \n\t" | |||||
| "xvmuldp 50, 42, 32 \n\t" | |||||
| "xvmuldp 51, 43, 32 \n\t" | |||||
| "xvmuldp 34, 44, 32 \n\t" | |||||
| "xvmuldp 35, 45, 32 \n\t" | |||||
| "xvmuldp 36, 46, 32 \n\t" | |||||
| "xvmuldp 37, 47, 32 \n\t" | |||||
| XXSWAPD_S(38,40) | |||||
| XXSWAPD_S(39,41) | |||||
| XXSWAPD_S(%x3,42) | |||||
| XXSWAPD_S(%x4,43) | |||||
| XXSWAPD_S(%x5,44) | |||||
| XXSWAPD_S(%x6,45) | |||||
| XXSWAPD_S(%x7,46) | |||||
| XXSWAPD_S(%x8,47) | |||||
| "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmuldp 39, 39, 33 \n\t" | |||||
| "xvmuldp %x3, %x3, 33 \n\t" | |||||
| "xvmuldp %x4, %x4, 33 \n\t" | |||||
| "xvmuldp %x5, %x5, 33 \n\t" | |||||
| "xvmuldp %x6, %x6, 33 \n\t" | |||||
| "xvmuldp %x7, %x7, 33 \n\t" | |||||
| "xvmuldp %x8, %x8, 33 \n\t" | |||||
| "xvadddp 48, 48, 38 \n\t" | |||||
| "xvadddp 49, 49, 39 \n\t" | |||||
| "xvadddp 50, 50, %x3 \n\t" | |||||
| "xvadddp 51, 51, %x4 \n\t" | |||||
| "stxv 49, 0(%2) \n\t" | |||||
| "stxv 48, 16(%2) \n\t" | |||||
| "stxv 51, 32(%2) \n\t" | |||||
| "stxv 50, 48(%2) \n\t" | |||||
| "xvadddp 34, 34, %x5 \n\t" | |||||
| "xvadddp 35, 35, %x6 \n\t" | |||||
| "xvadddp 36, 36, %x7 \n\t" | |||||
| "xvadddp 37, 37, %x8 \n\t" | |||||
| "stxv 35, 64(%2) \n\t" | |||||
| "stxv 34, 80(%2) \n\t" | |||||
| "stxv 37, 96(%2) \n\t" | |||||
| "stxv 36, 112(%2) \n\t" | |||||
| "#n=%1 x=%0=%2 alpha=(%9,%10) \n" | |||||
| : | |||||
| "+m" (*x), /* operand 0: tells the compiler that *x is both read and written */ | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "=wa" (t0), // 3 | |||||
| "=wa" (t1), // 4 | |||||
| "=wa" (t2), // 5 | |||||
| "=wa" (t3), // 6 | |||||
| "=wa" (t4), // 7 | |||||
| "=wa" (t5) // 8 | |||||
| : | |||||
| "d" (alpha_r), // 9 | |||||
| "d" (alpha_i) // 10 | |||||
| : | |||||
| "cr0", /* written by addic. */ | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51" | |||||
| ); | |||||
| } | |||||
| @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zswap_microk_power8.c" | #include "zswap_microk_power8.c" | ||||
| #elif defined(POWER10) | |||||
| #include "cswap_microk_power10.c" | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S | |||||
| SSUMKERNEL = ../arm/sum.c | SSUMKERNEL = ../arm/sum.c | ||||
| DSUMKERNEL = ../arm/sum.c | DSUMKERNEL = ../arm/sum.c | ||||
| SOMATCOPY_RT = omatcopy_rt.c | |||||
| DOMATCOPY_RT = omatcopy_rt.c | |||||
| @@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | ||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | ||||
| SROTKERNEL = srot.c | |||||
| DROTKERNEL = drot.c | |||||
| @@ -6,7 +6,7 @@ | |||||
| #if defined(SKYLAKEX) | #if defined(SKYLAKEX) | ||||
| #include "dasum_microk_skylakex-2.c" | #include "dasum_microk_skylakex-2.c" | ||||
| #elif defined(HASWELL) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "dasum_microk_haswell-2.c" | #include "dasum_microk_haswell-2.c" | ||||
| #endif | #endif | ||||
| @@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| #if defined(SMP) | #if defined(SMP) | ||||
| int nthreads; | int nthreads; | ||||
| FLOAT dummy_alpha; | FLOAT dummy_alpha; | ||||
| FLOAT * dummy_b; | |||||
| #endif | #endif | ||||
| FLOAT sumf = 0.0; | FLOAT sumf = 0.0; | ||||
| @@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| #else | #else | ||||
| mode = BLAS_DOUBLE | BLAS_REAL; | mode = BLAS_DOUBLE | BLAS_REAL; | ||||
| #endif | #endif | ||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||||
| ptr = (FLOAT *)result; | ptr = (FLOAT *)result; | ||||
| for (i = 0; i < nthreads; i++) { | for (i = 0; i < nthreads; i++) { | ||||
| sumf += (*ptr); | sumf += (*ptr); | ||||
| @@ -2,7 +2,7 @@ | |||||
| #if defined(SKYLAKEX) | #if defined(SKYLAKEX) | ||||
| #include "drot_microk_skylakex-2.c" | #include "drot_microk_skylakex-2.c" | ||||
| #elif defined(HASWELL) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "drot_microk_haswell-2.c" | #include "drot_microk_haswell-2.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,373 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifdef HAVE_AVX | |||||
| #define ROWS_OF_BLOCK 384 /* upper bound on the number of source rows CNAME hands to one COMPUTE pass */ | |||||
| /* +r: %0 = src, %1 = dst, %2 = src_ld (bytes), %3 = dst_ld (bytes), %4 = dst_tmp */ | |||||
| /* m: %5 = num_rows, %6 = alpha */ | |||||
| /* xmm15/ymm15 = alpha replicated in every float lane (loaded by vbroadcastss in COMPUTE) */ | |||||
| /* In-register transpose of the 4x4 float tile held in xmm a1..a4; t1..t4 are scratch | |||||
| and the transposed rows end up back in a1..a4. */ #define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ | |||||
| "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ | |||||
| "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ | |||||
| "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ | |||||
| "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" | |||||
| /* Same shuffle sequence on ymm registers: each 128-bit half of a1..a4 is | |||||
| transposed as its own 4x4 tile. */ #define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ | |||||
| "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ | |||||
| "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ | |||||
| "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ | |||||
| "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" | |||||
| /* Store xmm b1..b4 to four consecutive destination rows starting at %4 (row pitch %3); | |||||
| %4 is left pointing four rows further on. */ #define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ | |||||
| "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||||
| "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||||
| /* SAVE_4x4 for the low halves of ymm b1..b4, then the upper 128 bits of the same | |||||
| registers go to the following four destination rows. */ #define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ | |||||
| "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||||
| "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||||
| /* COPY_4xN: read 4 source rows of N floats, scale by alpha, and write them transposed as | |||||
| N destination rows of 4 floats; src (%0) moves on 4 rows and dst (%1) moves on 16 bytes. */ #define COPY_4x16 "movq %1,%4; addq $16,%1;"\ | |||||
| "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ | |||||
| "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ | |||||
| TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ | |||||
| TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) | |||||
| #define COPY_4x8 "movq %1,%4; addq $16,%1;"\ | |||||
| "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ | |||||
| "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ | |||||
| TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) | |||||
| #define COPY_4x4 "movq %1,%4; addq $16,%1;"\ | |||||
| "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ | |||||
| "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ | |||||
| TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) | |||||
| /* Two-column case: stores directly through %1 and (%1,%3,1), so dst_tmp (%4) is not used. */ #define COPY_4x2 \ | |||||
| "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||||
| "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ | |||||
| "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ | |||||
| "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" | |||||
| /* One-column case: gathers one float from each of 4 source rows into xmm0 and stores it as a single row. */ #define COPY_4x1 \ | |||||
| "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||||
| "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||||
| "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;" | |||||
| /* Interleave xmm c1 and c2 (4 floats from each of two source rows), scale by alpha, and write | |||||
| 2 floats to each of four destination rows through %4; t1/t2 are scratch. */ #define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ | |||||
| "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ | |||||
| "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||||
| "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ | |||||
| "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||||
| /* COPY_2xN: the same job for 2 source rows; src (%0) moves on 2 rows and dst (%1) moves on 8 bytes. */ #define COPY_2x16 "movq %1,%4; addq $8,%1;"\ | |||||
| "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ | |||||
| "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ | |||||
| SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) | |||||
| #define COPY_2x8 "movq %1,%4; addq $8,%1;"\ | |||||
| "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ | |||||
| "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ | |||||
| SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) | |||||
| #define COPY_2x4 "movq %1,%4; addq $8,%1;"\ | |||||
| "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ | |||||
| SAVE_2x4(0,1,4,5) | |||||
| #define COPY_2x2 \ | |||||
| "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ | |||||
| "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" | |||||
| #define COPY_2x1 \ | |||||
| "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" | |||||
| /* Scale xmm c1 by alpha and write its four floats to four consecutive destination rows through %4. */ #define SAVE_1x4(c1_no)\ | |||||
| "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||||
| "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||||
| /* COPY_1xN: the same job for a single source row; src (%0) moves on 1 row and dst (%1) moves on 4 bytes. */ #define COPY_1x16 "movq %1,%4; addq $4,%1;"\ | |||||
| "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ | |||||
| "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" | |||||
| #define COPY_1x8 "movq %1,%4; addq $4,%1;"\ | |||||
| "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" | |||||
| #define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" | |||||
| #define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" | |||||
| #define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" | |||||
| /* Transpose and scale one strip that is ndim source columns wide and num_rows source rows tall. | |||||
| * Resets src/dst from src_base/dst_base, broadcasts ALPHA into ymm15 and keeps the number of | |||||
| * rows still to do in r11. Rows are consumed four at a time in a loop (COPY_4x), then at most | |||||
| * one pair (COPY_2x) and at most one single row (COPY_1x). | |||||
| * The expansion site must have src, dst, dst_tmp, src_base, dst_base, src_ld_bytes, | |||||
| * dst_ld_bytes, num_rows and ALPHA in scope; ndim is also pasted into the numeric local labels. | |||||
| */ #define COMPUTE(ndim){\ | |||||
| src = src_base; dst = dst_base;\ | |||||
| __asm__ __volatile__(\ | |||||
| "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ | |||||
| #ndim"31:\n\t"\ | |||||
| COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ | |||||
| #ndim"32:\n\t"\ | |||||
| "cmpq $2,%%r11; jb "#ndim"33f;"\ | |||||
| COPY_2x##ndim "subq $2,%%r11;"\ | |||||
| #ndim"33:\n\t"\ | |||||
| "testq %%r11,%%r11; jz "#ndim"34f;"\ | |||||
| COPY_1x##ndim "subq $1,%%r11;"\ | |||||
| #ndim"34:\n\t"\ | |||||
| :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ | |||||
| ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||||
| } | |||||
| /* | |||||
|  * Out-of-place scaled transpose, AVX path: for every r < rows and c < cols, | |||||
|  *   b[c*ldb + r] = alpha * a[r*lda + c] | |||||
|  * | |||||
|  * rows, cols - dimensions of the source matrix a (nothing is done if either is <= 0) | |||||
|  * alpha      - scale factor; alpha == 0 just zero-fills the destination | |||||
|  * a, lda     - source matrix and the distance, in elements, between its rows | |||||
|  * b, ldb     - destination matrix and the distance, in elements, between its rows | |||||
|  * Always returns 0. | |||||
|  * | |||||
|  * NOTE(review): every local pointer and byte count here is float based, so this | |||||
|  * path presumably must only be built with FLOAT == float -- confirm in the build files. | |||||
|  */ | |||||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ | |||||
|   /* The names below are referenced from inside the COMPUTE macro and must keep their spelling. */ | |||||
|   float *src, *dst, *src_base, *dst_base; | |||||
|   /* dst_tmp is handed to the asm as a "+r" (read-write) operand, so give it a defined value. */ | |||||
|   float *dst_tmp = 0; | |||||
|   uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; | |||||
|   BLASLONG cols_left, rows_done; | |||||
|   float ALPHA = alpha; | |||||
|   /* Same early exit as the generic implementation; also keeps a negative row count away from memset. */ | |||||
|   if(rows <= 0 || cols <= 0) return 0; | |||||
|   if(ALPHA==0.0){ | |||||
|     /* Each source column becomes one destination row holding `rows` values. */ | |||||
|     dst_base = b; | |||||
|     for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); dst_base += ldb;} | |||||
|     return 0; | |||||
|   } | |||||
|   /* Walk the source in horizontal bands of at most ROWS_OF_BLOCK rows. */ | |||||
|   for(rows_done=0;rows_done<rows;rows_done+=num_rows){ | |||||
|     num_rows = rows-rows_done; | |||||
|     if(num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; | |||||
|     cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; | |||||
|     /* 16-column strips are skipped when ldb is within 3 of a multiple of 1024; | |||||
|        presumably this avoids cache-set conflicts between destination rows -- confirm. */ | |||||
|     if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} | |||||
|     for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} | |||||
|     for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} | |||||
|     for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} | |||||
|     if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| #else | |||||
| /* | |||||
|  * Write the scaled transpose of one nr x nc tile: | |||||
|  *   b_tile[c*ldb + r] = a_tile[r*lda + c] * alpha | |||||
|  * Source rows are the outer loop and source columns the inner loop. | |||||
|  */ | |||||
| static void omatcopy_rt_tile(BLASLONG nr, BLASLONG nc, FLOAT alpha, FLOAT *a_tile, BLASLONG lda, FLOAT *b_tile, BLASLONG ldb) | |||||
| { | |||||
|   BLASLONG r, c; | |||||
|   for (r = 0; r < nr; r++) { | |||||
|     for (c = 0; c < nc; c++) { | |||||
|       *(b_tile + c * ldb + r) = *(a_tile + r * lda + c) * alpha; | |||||
|     } | |||||
|   } | |||||
| } | |||||
| /* | |||||
|  * Transpose a band of nr consecutive source rows across all `cols` columns: | |||||
|  * tiles of 4 columns first, then one tile of 2 and one of 1 for the remainder. | |||||
|  */ | |||||
| static void omatcopy_rt_band(BLASLONG nr, BLASLONG cols, FLOAT alpha, FLOAT *a_band, BLASLONG lda, FLOAT *b_band, BLASLONG ldb) | |||||
| { | |||||
|   BLASLONG quads; | |||||
|   for (quads = (cols >> 2); quads > 0; quads--) { | |||||
|     omatcopy_rt_tile(nr, 4, alpha, a_band, lda, b_band, ldb); | |||||
|     a_band += 4; | |||||
|     b_band += ldb * 4; | |||||
|   } | |||||
|   if (cols & 2) { | |||||
|     omatcopy_rt_tile(nr, 2, alpha, a_band, lda, b_band, ldb); | |||||
|     a_band += 2; | |||||
|     b_band += ldb * 2; | |||||
|   } | |||||
|   if (cols & 1) { | |||||
|     omatcopy_rt_tile(nr, 1, alpha, a_band, lda, b_band, ldb); | |||||
|   } | |||||
| } | |||||
| /* | |||||
|  * Out-of-place scaled transpose, portable path: for every r < rows and c < cols, | |||||
|  *   b[c*ldb + r] = a[r*lda + c] * alpha | |||||
|  * The source is consumed in bands of 4 rows, then a band of 2 and a band of 1 | |||||
|  * for whatever is left. Returns 0; nothing is written when rows or cols is <= 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | |||||
| { | |||||
|   BLASLONG quad_bands; | |||||
|   FLOAT *a_band = a; | |||||
|   FLOAT *b_band = b; | |||||
|   if (rows <= 0) return 0; | |||||
|   if (cols <= 0) return 0; | |||||
|   for (quad_bands = (rows >> 2); quad_bands > 0; quad_bands--) { | |||||
|     omatcopy_rt_band(4, cols, alpha, a_band, lda, b_band, ldb); | |||||
|     a_band += 4 * lda; | |||||
|     b_band += 4; | |||||
|   } | |||||
|   if (rows & 2) { | |||||
|     omatcopy_rt_band(2, cols, alpha, a_band, lda, b_band, ldb); | |||||
|     a_band += 2 * lda; | |||||
|     b_band += 2; | |||||
|   } | |||||
|   if (rows & 1) { | |||||
|     omatcopy_rt_band(1, cols, alpha, a_band, lda, b_band, ldb); | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| #endif | |||||
| @@ -11,7 +11,7 @@ | |||||
| #if defined(SKYLAKEX) | #if defined(SKYLAKEX) | ||||
| #include "sasum_microk_skylakex-2.c" | #include "sasum_microk_skylakex-2.c" | ||||
| #elif defined(HASWELL) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "sasum_microk_haswell-2.c" | #include "sasum_microk_haswell-2.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,426 @@ | |||||
| #include "sbgemm.h" | |||||
| #include <immintrin.h> | |||||
| // Work around compilers that lack the unmasked 256-bit epi16 load/store intrinsics by using the masked forms with an all-ones mask | |||||
| #define MM256_LOADU_EPI16(addr) \ | |||||
| _mm256_maskz_loadu_epi16(~0, (addr)) | |||||
| #define MM256_STOREU_EPI16(addr, reg) \ | |||||
| _mm256_mask_storeu_epi16((addr), ~0, (reg)) | |||||
| #include <stdio.h> | |||||
| /* | |||||
|  * Debug helper: print an m x n block of bfloat16 values to stdout as hexadecimal, | |||||
|  * one row per line, reading mat as row-major with a row stride of n elements. | |||||
|  */ | |||||
| void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) | |||||
| { | |||||
|   /* BLASLONG is a project typedef that is not visibly `long` here, so cast to the | |||||
|      type that %ld expects instead of relying on the two having the same width. */ | |||||
|   printf("---- BLOCK %ld x %ld ----\n", (long)m, (long)n); | |||||
|   for (BLASLONG i=0; i<m; i++) { | |||||
|     for (BLASLONG j=0; j<n; j++) { | |||||
|       /* %X takes an unsigned int; assumes bfloat16 is an integer typedef -- confirm. */ | |||||
|       printf("%-4X ", (unsigned int)*(mat + i*n +j)); | |||||
|     } | |||||
|     printf("\n"); | |||||
|   } | |||||
|   printf("---- End of BLOCK ----\n"); | |||||
| } | |||||
| /* | |||||
|  * Pack k slices of 32 bfloat16 values from A (consecutive slices are lda elements | |||||
|  * apart) into block_A. Slices are taken two at a time and their 16-bit elements are | |||||
|  * interleaved with unpacklo/unpackhi, giving 64 packed values per pair. When k is | |||||
|  * odd the last slice is interleaved with zeros instead of a partner slice. | |||||
|  */ | |||||
| void COL_MAJOR_INCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||||
| { | |||||
|   const BLASLONG paired_k = k & (~1);   /* k rounded down to an even count */ | |||||
|   BLASLONG kk; | |||||
|   for (kk = 0; kk < paired_k; kk += 2) { | |||||
|     bfloat16 *slice  = &A[kk * lda]; | |||||
|     bfloat16 *packed = &block_A[kk * 32]; | |||||
|     const __m512i first  = _mm512_loadu_si512(slice); | |||||
|     const __m512i second = _mm512_loadu_si512(slice + lda); | |||||
|     _mm512_storeu_si512(packed,      _mm512_unpacklo_epi16(first, second)); | |||||
|     _mm512_storeu_si512(packed + 32, _mm512_unpackhi_epi16(first, second)); | |||||
|   } | |||||
|   if (paired_k != k) { | |||||
|     /* Odd k: the final slice has no partner, so pair it with zeros. */ | |||||
|     bfloat16 *packed = &block_A[paired_k * 32]; | |||||
|     const __m512i zeros = _mm512_setzero_si512(); | |||||
|     const __m512i last  = _mm512_loadu_si512(&A[paired_k * lda]); | |||||
|     _mm512_storeu_si512(packed,      _mm512_unpacklo_epi16(last, zeros)); | |||||
|     _mm512_storeu_si512(packed + 32, _mm512_unpackhi_epi16(last, zeros)); | |||||
|   } | |||||
| #ifdef DEBUG_PROFILE | |||||
|   print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||||
| #endif | |||||
| } | |||||
| /* | |||||
|  * Masked variant of the 32-wide pack: only the first m elements of every slice of A | |||||
|  * are loaded and the remaining lanes are zero-filled by the masked load. Slices are | |||||
|  * consumed in pairs (lda elements apart) and interleaved into block_A exactly as in | |||||
|  * the unmasked kernel; full 64-value groups are always stored. When k is odd the | |||||
|  * last slice is interleaved with zeros. | |||||
|  * | |||||
|  * m is expected to lie in [0, 32]; other values make the mask shift undefined. | |||||
|  */ | |||||
| void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||||
| { | |||||
|   const BLASLONG paired_k = k & (~1);   /* k rounded down to an even count */ | |||||
|   /* Low m bits set. A shift count of 32 (m == 0) is undefined in C, so that case is | |||||
|      spelled out; the mask is produced by a plain conversion rather than a pointer pun. */ | |||||
|   const __mmask32 tail_mask = (m > 0) ? (__mmask32)(((unsigned int)0xffffffff) >> (32-m)) : (__mmask32)0; | |||||
|   BLASLONG kk; | |||||
|   for (kk = 0; kk < paired_k; kk += 2) { | |||||
|     bfloat16 *slice  = &A[kk * lda]; | |||||
|     bfloat16 *packed = &block_A[kk * 32]; | |||||
|     const __m512i first  = _mm512_maskz_loadu_epi16(tail_mask, slice); | |||||
|     const __m512i second = _mm512_maskz_loadu_epi16(tail_mask, slice + lda); | |||||
|     _mm512_storeu_si512(packed,      _mm512_unpacklo_epi16(first, second)); | |||||
|     _mm512_storeu_si512(packed + 32, _mm512_unpackhi_epi16(first, second)); | |||||
|   } | |||||
|   if (paired_k != k) { | |||||
|     /* Odd k: the final slice has no partner, so pair it with zeros. */ | |||||
|     bfloat16 *packed = &block_A[paired_k * 32]; | |||||
|     const __m512i zeros = _mm512_setzero_si512(); | |||||
|     const __m512i last  = _mm512_maskz_loadu_epi16(tail_mask, &A[paired_k * lda]); | |||||
|     _mm512_storeu_si512(packed,      _mm512_unpacklo_epi16(last, zeros)); | |||||
|     _mm512_storeu_si512(packed + 32, _mm512_unpackhi_epi16(last, zeros)); | |||||
|   } | |||||
| #ifdef DEBUG_PROFILE | |||||
|   print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||||
| #endif | |||||
| } | |||||
| /* | |||||
|  * Pack k slices of 16 bfloat16 values from A (consecutive slices are lda elements | |||||
|  * apart) into block_A. Each pair of slices is interleaved element-wise with | |||||
|  * unpacklo/unpackhi and written as 32 consecutive packed values; when k is odd the | |||||
|  * last slice is interleaved with zeros. The m parameter is accepted but not used. | |||||
|  */ | |||||
| void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||||
| { | |||||
|   const BLASLONG paired_k = k & (~1);   /* k rounded down to an even count */ | |||||
|   BLASLONG kk; | |||||
|   for (kk = 0; kk < paired_k; kk += 2) { | |||||
|     bfloat16 *slice  = &A[kk * lda]; | |||||
|     bfloat16 *packed = &block_A[kk * 16]; | |||||
|     const __m256i first  = MM256_LOADU_EPI16(slice); | |||||
|     const __m256i second = MM256_LOADU_EPI16(slice + lda); | |||||
|     /* Both halves of the interleave land in the same 32-value output group. */ | |||||
|     MM256_STOREU_EPI16(packed,      _mm256_unpacklo_epi16(first, second)); | |||||
|     MM256_STOREU_EPI16(packed + 16, _mm256_unpackhi_epi16(first, second)); | |||||
|   } | |||||
|   if (paired_k != k) { | |||||
|     /* Odd k: the final slice has no partner, so pair it with zeros. */ | |||||
|     bfloat16 *packed = &block_A[paired_k * 16]; | |||||
|     const __m256i zeros = _mm256_setzero_si256(); | |||||
|     const __m256i last  = MM256_LOADU_EPI16(&A[paired_k * lda]); | |||||
|     MM256_STOREU_EPI16(packed,      _mm256_unpacklo_epi16(last, zeros)); | |||||
|     MM256_STOREU_EPI16(packed + 16, _mm256_unpackhi_epi16(last, zeros)); | |||||
|   } | |||||
| #ifdef DEBUG_PROFILE | |||||
|   print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||||
| #endif | |||||
| } | |||||
| /* | |||||
|  * Masked variant of the 16-wide pack: only the first m elements of every slice of A | |||||
|  * are loaded and the remaining lanes are zero-filled by the masked load. Each pair | |||||
|  * of slices (lda elements apart) is interleaved element-wise and written as 32 | |||||
|  * consecutive packed values; when k is odd the last slice is interleaved with zeros. | |||||
|  */ | |||||
| void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||||
| { | |||||
|   const BLASLONG paired_k = k & (~1);   /* k rounded down to an even count */ | |||||
|   /* Low m bits set: selects the first m lanes of each load. */ | |||||
|   const __mmask16 tail_mask = (__mmask16)(((unsigned short)0xffff) >> (16-m)); | |||||
|   BLASLONG kk; | |||||
|   for (kk = 0; kk < paired_k; kk += 2) { | |||||
|     bfloat16 *slice  = &A[kk * lda]; | |||||
|     bfloat16 *packed = &block_A[kk * 16]; | |||||
|     const __m256i first  = _mm256_maskz_loadu_epi16(tail_mask, slice); | |||||
|     const __m256i second = _mm256_maskz_loadu_epi16(tail_mask, slice + lda); | |||||
|     /* Both halves of the interleave land in the same 32-value output group. */ | |||||
|     MM256_STOREU_EPI16(packed,      _mm256_unpacklo_epi16(first, second)); | |||||
|     MM256_STOREU_EPI16(packed + 16, _mm256_unpackhi_epi16(first, second)); | |||||
|   } | |||||
|   if (paired_k != k) { | |||||
|     /* Odd k: the final slice has no partner, so pair it with zeros. */ | |||||
|     bfloat16 *packed = &block_A[paired_k * 16]; | |||||
|     const __m256i zeros = _mm256_setzero_si256(); | |||||
|     const __m256i last  = _mm256_maskz_loadu_epi16(tail_mask, &A[paired_k * lda]); | |||||
|     MM256_STOREU_EPI16(packed,      _mm256_unpacklo_epi16(last, zeros)); | |||||
|     MM256_STOREU_EPI16(packed + 16, _mm256_unpackhi_epi16(last, zeros)); | |||||
|   } | |||||
| #ifdef DEBUG_PROFILE | |||||
|   print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||||
| #endif | |||||
| } | |||||
| void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) | |||||
| { | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; | |||||
| BLASLONG idx_target_base0; | |||||
| idx_src_base0 = 0; | |||||
| idx_src_base1 = 1*ldb; | |||||
| idx_src_base2 = 2*ldb; | |||||
| idx_src_base3 = 3*ldb; | |||||
| idx_src_base4 = 4*ldb; | |||||
| idx_src_base5 = 5*ldb; | |||||
| idx_src_base6 = 6*ldb; | |||||
| idx_src_base7 = 7*ldb; | |||||
| idx_target_base0 = 0; | |||||
| for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); | |||||
| idx_target_base0 += 32*8; | |||||
| } | |||||
| if (tag_k_32x != k) { | |||||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); | |||||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); | |||||
| } | |||||
| #ifdef DEBUG_PROFILE | |||||
| print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); | |||||
| #endif | |||||
| } | |||||
| void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) | |||||
| { | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG tag_n_2x = n & (~1); | |||||
| BLASLONG idx_src_base0; | |||||
| BLASLONG idx_target_base0; | |||||
| BLASLONG LDB_2x = 2*ldb; | |||||
| idx_target_base0 = 0; | |||||
| for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { | |||||
| idx_src_base0 = 0; | |||||
| for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); | |||||
| idx_src_base0 += LDB_2x; | |||||
| idx_target_base0 += 64; | |||||
| } | |||||
| if (tag_n_2x != n) { | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); | |||||
| idx_target_base0 += 32; | |||||
| } | |||||
| } | |||||
| if (tag_k_32x != k) { | |||||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); | |||||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||||
| idx_src_base0 = 0; | |||||
| for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); | |||||
| idx_src_base0 += LDB_2x; | |||||
| idx_target_base0 += 64; | |||||
| } | |||||
| if (tag_n_2x != n) { | |||||
| _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); | |||||
| } | |||||
| } | |||||
| #ifdef DEBUG_PROFILE | |||||
| print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); | |||||
| #endif | |||||
| } | |||||
| // Scale matrix C while beta is not ZERO or ONE | |||||
| void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) | |||||
| { | |||||
| BLASLONG tag_n_Nx = N & (~3); | |||||
| BLASLONG tag_n_Mx = M & (~15); | |||||
| BLASLONG LDC4x = ldc*4; | |||||
| BLASLONG idx_base_0 = 0; | |||||
| BLASLONG idx_base_1 = ldc; | |||||
| BLASLONG idx_base_2 = ldc*2; | |||||
| BLASLONG idx_base_3 = ldc*3; | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| __m512 array_512_0, array_512_1, array_512_2, array_512_3; | |||||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||||
| if (Order == CblasColMajor) { | |||||
| for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { | |||||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||||
| array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); | |||||
| array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); | |||||
| array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); | |||||
| array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); | |||||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||||
| array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); | |||||
| array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); | |||||
| array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); | |||||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); | |||||
| _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); | |||||
| _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); | |||||
| _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); | |||||
| } | |||||
| if (tag_n_Mx != M) { | |||||
| array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); | |||||
| array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); | |||||
| array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); | |||||
| array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); | |||||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||||
| array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); | |||||
| array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); | |||||
| array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); | |||||
| } | |||||
| idx_base_0 += LDC4x; | |||||
| idx_base_1 += LDC4x; | |||||
| idx_base_2 += LDC4x; | |||||
| idx_base_3 += LDC4x; | |||||
| } | |||||
| if (tag_n_Nx != N) { | |||||
| for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { | |||||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||||
| array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); | |||||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); | |||||
| } | |||||
| if (tag_n_Mx != M) { | |||||
| array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); | |||||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); | |||||
| } | |||||
| idx_base_0 += ldc; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } | |||||
| } | |||||
| // Scale matrix C while beta is not ZERO or ONE | |||||
| void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) | |||||
| { | |||||
| BLASLONG tag_n_Nx = N & (~3); | |||||
| BLASLONG tag_n_Mx = M & (~15); | |||||
| BLASLONG LDC4x = ldc*4; | |||||
| BLASLONG idx_base_0 = 0; | |||||
| BLASLONG idx_base_1 = ldc; | |||||
| BLASLONG idx_base_2 = ldc*2; | |||||
| BLASLONG idx_base_3 = ldc*3; | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| __m512 ZEROVECTOR = _mm512_setzero_ps(); | |||||
| if (Order == CblasColMajor) { | |||||
| for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { | |||||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); | |||||
| _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); | |||||
| _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); | |||||
| _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); | |||||
| } | |||||
| if (tag_n_Mx != M) { | |||||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); | |||||
| _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); | |||||
| } | |||||
| idx_base_0 += LDC4x; | |||||
| idx_base_1 += LDC4x; | |||||
| idx_base_2 += LDC4x; | |||||
| idx_base_3 += LDC4x; | |||||
| } | |||||
| if (tag_n_Nx != N) { | |||||
| for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { | |||||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); | |||||
| } | |||||
| if (tag_n_Mx != M) { | |||||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); | |||||
| } | |||||
| idx_base_0 += ldc; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,625 @@ | |||||
| #include "sbgemm.h" | |||||
| #include "bf16_common_macros.h" | |||||
| #include <immintrin.h> | |||||
// Drop any previous bindings of the generic names, then bind them according
// to whether ONE_ALPHA is defined.
// NOTE(review): the #undef block suggests this file is compiled/included once
// per ONE_ALPHA setting -- confirm against the including file.
#undef SBGEMM_BLOCKING_KERNEL_2
#undef SBGEMM_BLOCK_KERNEL_16xNx32
#undef SBGEMM_BLOCK_KERNEL_32xNx32
#undef SBGEMM_BLOCK_KERNEL_16x8x32
#undef SBGEMM_BLOCK_KERNEL_32x8x32
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE16_COMPLETE_RESULT

#ifdef ONE_ALPHA  // ALPHA is ONE
#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ONE_ONE
#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ONE_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32   sbgemm_block_kernel_32x8x32_one
#define SBGEMM_BLOCK_KERNEL_16x8x32   sbgemm_block_kernel_16x8x32_one
#define SBGEMM_BLOCK_KERNEL_32xNx32   sbgemm_block_kernel_32xNx32_one
#define SBGEMM_BLOCK_KERNEL_16xNx32   sbgemm_block_kernel_16xNx32_one
#define SBGEMM_BLOCKING_KERNEL_2      sbgemm_blocking_kernel_2_one
#else             // ALPHA is not ONE
#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32   sbgemm_block_kernel_32x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_16x8x32   sbgemm_block_kernel_16x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_32xNx32   sbgemm_block_kernel_32xNx32_alpha
#define SBGEMM_BLOCK_KERNEL_16xNx32   sbgemm_block_kernel_16xNx32_alpha
#define SBGEMM_BLOCKING_KERNEL_2      sbgemm_blocking_kernel_2_alpha
#endif
| // SBGEMM Kernel for 16<M<=32, N=8, K can be any number, but the processing will take 32 as a base | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_block_kernel_32x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_block_kernel_32x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #endif | |||||
| { | |||||
| int SHUFFLE_MAGIC_NO = 0x39; | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG idxA_base = 0; | |||||
| BLASLONG idxB_base = 0; | |||||
| BLASLONG width = 32; | |||||
| #ifndef ONE_ALPHA | |||||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||||
| #endif | |||||
| __m512i arrayA_512_0, arrayA_512_1; | |||||
| __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; | |||||
| __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7, | |||||
| result_512_8, result_512_9, result_512_10, result_512_11, result_512_12, result_512_13, result_512_14, result_512_15; | |||||
| __m512 result_512_tmp_0, result_512_tmp_1, result_512_tmp_2, result_512_tmp_3; | |||||
| __m512i M512_EPI32_8 = _mm512_set1_epi32(8); | |||||
| __m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0); | |||||
| __m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8); | |||||
| result_512_0 = _mm512_setzero_ps(); | |||||
| result_512_1 = _mm512_setzero_ps(); | |||||
| result_512_2 = _mm512_setzero_ps(); | |||||
| result_512_3 = _mm512_setzero_ps(); | |||||
| result_512_4 = _mm512_setzero_ps(); | |||||
| result_512_5 = _mm512_setzero_ps(); | |||||
| result_512_6 = _mm512_setzero_ps(); | |||||
| result_512_7 = _mm512_setzero_ps(); | |||||
| result_512_8 = _mm512_setzero_ps(); | |||||
| result_512_9 = _mm512_setzero_ps(); | |||||
| result_512_10 = _mm512_setzero_ps(); | |||||
| result_512_11 = _mm512_setzero_ps(); | |||||
| result_512_12 = _mm512_setzero_ps(); | |||||
| result_512_13 = _mm512_setzero_ps(); | |||||
| result_512_14 = _mm512_setzero_ps(); | |||||
| result_512_15 = _mm512_setzero_ps(); | |||||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||||
| // Load B with unroll 8 | |||||
| idxB_base = idx_k << 3; | |||||
| arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); | |||||
| arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); | |||||
| arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); | |||||
| arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); | |||||
| arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); | |||||
| arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); | |||||
| arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]); | |||||
| arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); | |||||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||||
| for (BLASLONG idx = 0; idx < width;) { | |||||
| // Each two rows are a group for 32-pair bf16 elements | |||||
| idxA_base = idx << 5; | |||||
| arrayA_512_0 = _mm512_loadu_si512(&A[idxA_base]); | |||||
| arrayA_512_1 = _mm512_loadu_si512(&A[idxA_base + 32]); | |||||
| result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||||
| result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||||
| result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||||
| result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||||
| result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||||
| result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||||
| result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||||
| result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||||
| result_512_8 = _mm512_dpbf16_ps(result_512_8, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||||
| result_512_9 = _mm512_dpbf16_ps(result_512_9, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||||
| result_512_10 = _mm512_dpbf16_ps(result_512_10, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||||
| result_512_11 = _mm512_dpbf16_ps(result_512_11, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||||
| result_512_12 = _mm512_dpbf16_ps(result_512_12, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||||
| result_512_13 = _mm512_dpbf16_ps(result_512_13, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||||
| result_512_14 = _mm512_dpbf16_ps(result_512_14, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||||
| result_512_15 = _mm512_dpbf16_ps(result_512_15, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||||
| arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); | |||||
| idx += 2; | |||||
| // Every 4 loops we need to switch to next 128 bits of arrayB registers | |||||
| if ((idx & (~7)) == idx) { | |||||
| arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| } | |||||
| } | |||||
| if (m != 32) { | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask) | |||||
| } else { | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16])) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16])) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16])) | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); | |||||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); | |||||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16])) | |||||
| } | |||||
| } | |||||
| // SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #endif | |||||
| { | |||||
| int SHUFFLE_MAGIC_NO = 0x39; | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG idxB_base = 0; | |||||
| BLASLONG width = 32; | |||||
| #ifndef ONE_ALPHA | |||||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||||
| #endif | |||||
| __m512i arrayA_512_0; | |||||
| __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; | |||||
| __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; | |||||
| result_512_0 = _mm512_setzero_ps(); | |||||
| result_512_1 = _mm512_setzero_ps(); | |||||
| result_512_2 = _mm512_setzero_ps(); | |||||
| result_512_3 = _mm512_setzero_ps(); | |||||
| result_512_4 = _mm512_setzero_ps(); | |||||
| result_512_5 = _mm512_setzero_ps(); | |||||
| result_512_6 = _mm512_setzero_ps(); | |||||
| result_512_7 = _mm512_setzero_ps(); | |||||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||||
| // Load B with unroll 8 | |||||
| idxB_base = idx_k << 3; | |||||
| arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); | |||||
| arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); | |||||
| arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); | |||||
| arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); | |||||
| arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); | |||||
| arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); | |||||
| arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]); | |||||
| arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); | |||||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||||
| for (BLASLONG idx = 0; idx < width;) { | |||||
| // Each two rows are a group for 32-pair bf16 elements | |||||
| // Load two rows into a 512 register | |||||
| arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]); | |||||
| result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||||
| result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||||
| result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||||
| result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||||
| result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||||
| result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||||
| result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||||
| result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||||
| arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); | |||||
| idx += 2; | |||||
| // Every 4 loops we need to switch to next 128 bits of arrayB registers | |||||
| if ((idx & (~7)) == idx) { | |||||
| arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); | |||||
| arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| } | |||||
| } | |||||
| if (m != 16) { | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); | |||||
| result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); | |||||
| result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); | |||||
| result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) | |||||
| result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); | |||||
| result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); | |||||
| result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); | |||||
| result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) | |||||
| } else { | |||||
| result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); | |||||
| result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); | |||||
| result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); | |||||
| result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); | |||||
| STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) | |||||
| STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) | |||||
| STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) | |||||
| STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) | |||||
| result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); | |||||
| result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); | |||||
| result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); | |||||
| result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); | |||||
| STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) | |||||
| STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) | |||||
| STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) | |||||
| STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) | |||||
| } | |||||
| } | |||||
| // SBGEMM Kernel for 16<M<=32, N<8, K can be any number, but the processing will take 32 as a base | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_block_kernel_32xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_block_kernel_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #endif | |||||
| { | |||||
| int SHUFFLE_MAGIC_NO = 0x39; | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG idxA_base = 0; | |||||
| BLASLONG idxB_base = 0; | |||||
| BLASLONG width = 32; | |||||
| #ifndef ONE_ALPHA | |||||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||||
| #endif | |||||
| __m512i arrayA_512[2]; | |||||
| __m512i arrayB_512[8]; | |||||
| __m512 result_512[16]; | |||||
| __m512 result_512_tmp_0, result_512_tmp_1; | |||||
| __m512i M512_EPI32_8 = _mm512_set1_epi32(8); | |||||
| __m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0); | |||||
| __m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8); | |||||
| for (int i = 0; i < 15; i += 2) { | |||||
| result_512[i] = _mm512_setzero_ps(); | |||||
| result_512[i+1] = _mm512_setzero_ps(); | |||||
| } | |||||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||||
| // Load B with unroll n | |||||
| for (int i = 0; i < n; i ++) { | |||||
| arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); | |||||
| idxB_base += 32; | |||||
| } | |||||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||||
| for (BLASLONG idx = 0; idx < width;) { | |||||
| // Each two rows are a group for 32-pair bf16 elements | |||||
| idxA_base = idx << 5; | |||||
| arrayA_512[0] = _mm512_loadu_si512(&A[idxA_base]); | |||||
| arrayA_512[1] = _mm512_loadu_si512(&A[idxA_base + 32]); | |||||
| for (int i = 0; i < n; i++) { | |||||
| result_512[i] = _mm512_dpbf16_ps(result_512[i] , (__m512bh) arrayA_512[0], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||||
| result_512[i+8] = _mm512_dpbf16_ps(result_512[i+8], (__m512bh) arrayA_512[1], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||||
| arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| idx += 2; | |||||
| // Every 4 loops we need to switch to next 128 bits of arrayB registers | |||||
| if ((idx & (~7)) == idx) { | |||||
| for (int i = 0; i < n; i++) { | |||||
| arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| if (m != 32) { | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| for (int i = 0; i < n; i++) { | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) | |||||
| } | |||||
| } else { | |||||
| for (int i = 0; i < n; i++) { | |||||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); | |||||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) | |||||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) | |||||
| } | |||||
| } | |||||
| } | |||||
| // SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||||
| #endif | |||||
| { | |||||
| int SHUFFLE_MAGIC_NO = 0x39; | |||||
| BLASLONG tag_k_32x = k & (~31); | |||||
| BLASLONG idxB_base = 0; | |||||
| BLASLONG width = 32; | |||||
| #ifndef ONE_ALPHA | |||||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||||
| #endif | |||||
| __m512i arrayA_512; | |||||
| __m512i arrayB_512[8]; | |||||
| __m512 result_512[8]; | |||||
| for (int i = 0; i < 8; i += 2) { | |||||
| result_512[i] = _mm512_setzero_ps(); | |||||
| result_512[i+1] = _mm512_setzero_ps(); | |||||
| } | |||||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||||
| // Load B with unroll n | |||||
| for (int i = 0; i < n; i ++) { | |||||
| arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); | |||||
| idxB_base += 32; | |||||
| } | |||||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||||
| for (BLASLONG idx = 0; idx < width;) { | |||||
| // Each two rows are a group for 32-pair bf16 elements | |||||
| // Load two rows into a 512 register | |||||
| arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); | |||||
| for (int i = 0; i < n; i ++) { | |||||
| result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||||
| arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| idx += 2; | |||||
| // Every 4 loops we need to switch to next 128 bits of arrayB registers | |||||
| if ((idx & (~7)) == idx) { | |||||
| for (int i = 0; i < n; i++) { | |||||
| arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| if (m != 16) { | |||||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); | |||||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||||
| for (int i = 0; i < n; i++) { | |||||
| result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); | |||||
| STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) | |||||
| } | |||||
| } else { | |||||
| for (int i = 0; i < n; i++) { | |||||
| result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); | |||||
| STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) | |||||
| } | |||||
| } | |||||
| } | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) | |||||
| #endif | |||||
| { | |||||
| BLASLONG m_step, n_step, k_step, k_step_round32; | |||||
| BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); | |||||
| BLASLONG n_from, n_to; | |||||
| BLASLONG tag_n_Nx; | |||||
| n_from = 0; | |||||
| n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; | |||||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); | |||||
| k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; | |||||
| k_step_round32 = k_step & (~31); | |||||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; | |||||
| if (M >= BF16_BLOCK_THRES_M) { | |||||
| while (n_from < N) { | |||||
| for (BLASLONG idx_k = 0; idx_k < K;) { | |||||
| // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); | |||||
| // TODO: MT | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... | |||||
| COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); | |||||
| SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); | |||||
| SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); | |||||
| } | |||||
| for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); | |||||
| } | |||||
| } | |||||
| if (tag_m_Nx != M) { | |||||
| m_step = M - tag_m_Nx; | |||||
| if (m_step > 16) { | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||||
| } | |||||
| } else if (m_step == 16) { | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||||
| } | |||||
| } else { | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||||
| } | |||||
| } | |||||
| } | |||||
| idx_k += k_step; | |||||
| k_step = K - idx_k; | |||||
| k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; | |||||
| k_step_round32 = k_step & (~31); | |||||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; | |||||
| } | |||||
| n_from = n_to; | |||||
| n_to += BF16_BLOCK_THRES_N; | |||||
| n_to = (n_to > N) ? N : n_to; | |||||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); | |||||
| } | |||||
| } else { | |||||
| m_step = M - tag_m_Nx; | |||||
| while (n_from < N) { | |||||
| for (BLASLONG idx_k = 0; idx_k < K;) { | |||||
| // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... | |||||
| COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); | |||||
| // TODO: MT | |||||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||||
| // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... | |||||
| COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); | |||||
| SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); | |||||
| } | |||||
| if (tag_n_Nx != n_to) { | |||||
| n_step = n_to - tag_n_Nx; | |||||
| COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); | |||||
| SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); | |||||
| } | |||||
| idx_k += k_step; | |||||
| k_step = K - idx_k; | |||||
| k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; | |||||
| k_step_round32 = k_step & (~31); | |||||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; | |||||
| } | |||||
| n_from = n_to; | |||||
| n_to += BF16_BLOCK_THRES_N; | |||||
| n_to = (n_to > N) ? N : n_to; | |||||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); | |||||
| } | |||||
| } | |||||
| } | |||||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||||
| void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) | |||||
| #else // ALPHA is ONE | |||||
| void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) | |||||
| #endif | |||||
| { | |||||
| bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; | |||||
| bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; | |||||
| // TODO: assume no trans for both A and B, to complement these scenarios later | |||||
| if (Order == CblasColMajor) { | |||||
| SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); | |||||
| } else { | |||||
| } | |||||
| } | |||||
| @@ -1,8 +1,11 @@ | |||||
| /* the direct sgemm code written by Arjan van der Ven */ | /* the direct sgemm code written by Arjan van der Ven */ | ||||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||||
| #include <immintrin.h> | #include <immintrin.h> | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||||
| /* | /* | ||||
| * "Direct sgemm" code. This code operates directly on the inputs and outputs | * "Direct sgemm" code. This code operates directly on the inputs and outputs | ||||
| * of the sgemm call, avoiding the copies, memory realignments and threading, | * of the sgemm call, avoiding the copies, memory realignments and threading, | ||||
| @@ -2,7 +2,7 @@ | |||||
| #if defined(SKYLAKEX) | #if defined(SKYLAKEX) | ||||
| #include "srot_microk_skylakex-2.c" | #include "srot_microk_skylakex-2.c" | ||||
| #elif defined(HASWELL) | |||||
| #elif defined(HASWELL) || defined(ZEN) | |||||
| #include "srot_microk_haswell-2.c" | #include "srot_microk_haswell-2.c" | ||||
| #endif | #endif | ||||
| @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||||
| { | { | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| #if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) | |||||
| #if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) | |||||
| const int vstep = v_nlanes_f32; | const int vstep = v_nlanes_f32; | ||||
| const int unrollx4 = n & (-vstep * 4); | const int unrollx4 = n & (-vstep * 4); | ||||
| const int unrollx = n & -vstep; | const int unrollx = n & -vstep; | ||||
| @@ -1,5 +1,4 @@ | |||||
| /* need a new enough GCC for avx512 support */ | |||||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||||
| #if defined(HAVE_FMA3) && defined(HAVE_AVX2) | |||||
| #define HAVE_SROT_KERNEL 1 | #define HAVE_SROT_KERNEL 1 | ||||
| @@ -320,12 +320,13 @@ | |||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | ||||
| $ U12, X | |||||
| $ U12, X, ABI12, Y | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| COMPLEX CLADIV | |||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| REAL CLANHS, SLAMCH | REAL CLANHS, SLAMCH | ||||
| EXTERNAL LSAME, CLANHS, SLAMCH | |||||
| EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH | |||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | ||||
| @@ -729,22 +730,34 @@ | |||||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | ||||
| $ ( BSCALE*T( ILAST, ILAST ) ) | $ ( BSCALE*T( ILAST, ILAST ) ) | ||||
| ABI22 = AD22 - U12*AD21 | ABI22 = AD22 - U12*AD21 | ||||
| ABI12 = AD12 - U12*AD11 | |||||
| * | * | ||||
| T1 = HALF*( AD11+ABI22 ) | |||||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||||
| TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + | |||||
| $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) | |||||
| IF( TEMP.LE.ZERO ) THEN | |||||
| SHIFT = T1 + RTDISC | |||||
| ELSE | |||||
| SHIFT = T1 - RTDISC | |||||
| SHIFT = ABI22 | |||||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||||
| TEMP = ABS1( CTEMP ) | |||||
| IF( CTEMP.NE.ZERO ) THEN | |||||
| X = HALF*( AD11-SHIFT ) | |||||
| TEMP2 = ABS1( X ) | |||||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||||
| IF( TEMP2.GT.ZERO ) THEN | |||||
| IF( REAL( X / TEMP2 )*REAL( Y )+ | |||||
| $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y | |||||
| END IF | |||||
| SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) | |||||
| END IF | END IF | ||||
| ELSE | ELSE | ||||
| * | * | ||||
| * Exceptional shift. Chosen for no particularly good reason. | * Exceptional shift. Chosen for no particularly good reason. | ||||
| * | * | ||||
| ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ | |||||
| $ (BSCALE*T(ILAST-1,ILAST-1)) | |||||
| IF( ( IITER / 20 )*20.EQ.IITER .AND. | |||||
| $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN | |||||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||||
| $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) | |||||
| ELSE | |||||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||||
| $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) | |||||
| END IF | |||||
| SHIFT = ESHIFT | SHIFT = ESHIFT | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -320,12 +320,13 @@ | |||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | ||||
| $ U12, X | |||||
| $ U12, X, ABI12, Y | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| COMPLEX*16 ZLADIV | |||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| DOUBLE PRECISION DLAMCH, ZLANHS | DOUBLE PRECISION DLAMCH, ZLANHS | ||||
| EXTERNAL LSAME, DLAMCH, ZLANHS | |||||
| EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS | |||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL | EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL | ||||
| @@ -730,22 +731,34 @@ | |||||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | ||||
| $ ( BSCALE*T( ILAST, ILAST ) ) | $ ( BSCALE*T( ILAST, ILAST ) ) | ||||
| ABI22 = AD22 - U12*AD21 | ABI22 = AD22 - U12*AD21 | ||||
| ABI12 = AD12 - U12*AD11 | |||||
| * | * | ||||
| T1 = HALF*( AD11+ABI22 ) | |||||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||||
| TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + | |||||
| $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) | |||||
| IF( TEMP.LE.ZERO ) THEN | |||||
| SHIFT = T1 + RTDISC | |||||
| ELSE | |||||
| SHIFT = T1 - RTDISC | |||||
| SHIFT = ABI22 | |||||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||||
| TEMP = ABS1( CTEMP ) | |||||
| IF( CTEMP.NE.ZERO ) THEN | |||||
| X = HALF*( AD11-SHIFT ) | |||||
| TEMP2 = ABS1( X ) | |||||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||||
| IF( TEMP2.GT.ZERO ) THEN | |||||
| IF( DBLE( X / TEMP2 )*DBLE( Y )+ | |||||
| $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y | |||||
| END IF | |||||
| SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) | |||||
| END IF | END IF | ||||
| ELSE | ELSE | ||||
| * | * | ||||
| * Exceptional shift. Chosen for no particularly good reason. | * Exceptional shift. Chosen for no particularly good reason. | ||||
| * | * | ||||
| ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ | |||||
| $ (BSCALE*T(ILAST-1,ILAST-1)) | |||||
| IF( ( IITER / 20 )*20.EQ.IITER .AND. | |||||
| $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN | |||||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||||
| $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) | |||||
| ELSE | |||||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||||
| $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) | |||||
| END IF | |||||
| SHIFT = ESHIFT | SHIFT = ESHIFT | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) | |||||
| endif() | endif() | ||||
| if(WIN32) | |||||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 | |||||
| "if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" | |||||
| "$ErrorActionPreference = \"Stop\"\n" | |||||
| "Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" | |||||
| "If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" | |||||
| "echo Error\n" | |||||
| "exit 1\n" | |||||
| "} else {\n" | |||||
| "exit 0\n" | |||||
| "}\n" | |||||
| ) | |||||
| set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") | |||||
| else() | |||||
| # $1 exec, $2 input, $3 output_result | # $1 exec, $2 input, $3 output_result | ||||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh | FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh | ||||
| "rm -f $3\n" | "rm -f $3\n" | ||||
| @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh | |||||
| "exit 0\n" | "exit 0\n" | ||||
| "fi\n" | "fi\n" | ||||
| ) | ) | ||||
| set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") | |||||
| endif() | |||||
| add_test(NAME "REAL_LAPACK_linear_equation_routines" | add_test(NAME "REAL_LAPACK_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" | |||||
| ) | ) | ||||
| add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" | add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" | |||||
| ) | ) | ||||
| add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" | add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" | |||||
| ) | ) | ||||
| add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" | add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" | |||||
| ) | ) | ||||
| add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" | add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" | |||||
| ) | ) | ||||
| # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== | # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== | ||||
| add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" | add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" | |||||
| ) | ) | ||||
| # ======== SINGLE RFP LIN TESTS ======================== | # ======== SINGLE RFP LIN TESTS ======================== | ||||
| add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" | add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" | |||||
| ) | ) | ||||
| # ======== DOUBLE PRECISION RFP LIN TESTS ======================== | # ======== DOUBLE PRECISION RFP LIN TESTS ======================== | ||||
| add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" | add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" | |||||
| ) | ) | ||||
| # ======== COMPLEX RFP LIN TESTS ======================== | # ======== COMPLEX RFP LIN TESTS ======================== | ||||
| add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" | add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" | |||||
| ) | ) | ||||
| # ======== COMPLEX16 RFP LIN TESTS ======================== | # ======== COMPLEX16 RFP LIN TESTS ======================== | ||||
| add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" | add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" | |||||
| ) | ) | ||||
| # | # | ||||
| # | # | ||||
| @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" | |||||
| # | # | ||||
| add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" | add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" | add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" | add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" | |||||
| ) | ) | ||||
| add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" | add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" | add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" | add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" | add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" | add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" | add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" | |||||
| ) | ) | ||||
| add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" | |||||
| ) | ) | ||||
| add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" | add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" | |||||
| ) | ) | ||||
| add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" | add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" | |||||
| ) | ) | ||||
| # ======== COMPLEX EIG TESTS =========================== | # ======== COMPLEX EIG TESTS =========================== | ||||
| add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" | |||||
| ) | ) | ||||
| add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" | |||||
| ) | ) | ||||
| add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" | |||||
| ) | ) | ||||
| add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" | add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" | |||||
| ) | ) | ||||
| add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" | add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" | |||||
| ) | ) | ||||
| add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" | add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" | |||||
| ) | ) | ||||
| add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" | |||||
| ) | ) | ||||
| add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" | add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" | add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" | add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" | add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" | add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" | add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" | |||||
| ) | ) | ||||
| add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" | |||||
| ) | ) | ||||
| add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" | add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" | |||||
| ) | ) | ||||
| add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" | add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" | |||||
| ) | ) | ||||
| # ======== DOUBLE EIG TESTS =========================== | # ======== DOUBLE EIG TESTS =========================== | ||||
| add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" | |||||
| ) | ) | ||||
| add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" | |||||
| ) | ) | ||||
| add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" | |||||
| ) | ) | ||||
| add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" | add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" | |||||
| ) | ) | ||||
| add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" | add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" | |||||
| ) | ) | ||||
| add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" | add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" | |||||
| ) | ) | ||||
| add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" | |||||
| ) | ) | ||||
| add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" | add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" | add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" | add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" | add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" | add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" | add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" | |||||
| ) | ) | ||||
| add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" | |||||
| ) | ) | ||||
| add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" | add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" | |||||
| ) | ) | ||||
| add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" | add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" | |||||
| ) | ) | ||||
| # ======== COMPLEX16 EIG TESTS =========================== | # ======== COMPLEX16 EIG TESTS =========================== | ||||
| add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" | add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" | add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" | add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" | add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" | add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" | add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" | add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" | add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" | add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" | |||||
| ) | ) | ||||
| add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" | add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" | |||||
| ) | ) | ||||
| # Registers the COMPLEX16 constrained linear least squares test: runs EIG/xeigtstz on lse.in and passes zlse.out as the output path. | # Registers the COMPLEX16 constrained linear least squares test: runs EIG/xeigtstz on lse.in and passes zlse.out as the output path. | ||||
| # NOTE(review): unlike the sibling COMPLEX16 EIG tests this name has no "ZLSE:_Testing_" prefix; kept as-is so name-based test selection is unaffected - confirm whether a rename is wanted. | # NOTE(review): unlike the sibling COMPLEX16 EIG tests this name has no "ZLSE:_Testing_" prefix; kept as-is so name-based test selection is unaffected - confirm whether a rename is wanted. | ||||
| add_test(NAME "Constrained_Linear_Least_Squares_routines" | add_test(NAME "Constrained_Linear_Least_Squares_routines" | ||||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" | |||||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" | |||||
| ) | ) | ||||
| @@ -25,7 +25,7 @@ set(AEIGTST | |||||
| set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f | set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f | ||||
| ssvdch.f ssvdct.f ssxt1.f) | ssvdch.f ssvdct.f ssxt1.f) | ||||
| set(SEIGTST schkee.f | |||||
| set(SEIGTST schkee.F | |||||
| sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f | sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f | ||||
| schkbb.f schkbd.f schkbk.f schkbl.f schkec.f | schkbb.f schkbd.f schkbk.f schkbl.f schkec.f | ||||
| schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f | schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f | ||||
| @@ -42,7 +42,7 @@ set(SEIGTST schkee.f | |||||
| sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f | sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f | ||||
| sstt22.f ssyt21.f ssyt22.f) | sstt22.f ssyt21.f ssyt22.f) | ||||
| set(CEIGTST cchkee.f | |||||
| set(CEIGTST cchkee.F | |||||
| cbdt01.f cbdt02.f cbdt03.f cbdt05.f | cbdt01.f cbdt02.f cbdt03.f cbdt05.f | ||||
| cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f | cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f | ||||
| cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f | cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f | ||||
| @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f | |||||
| set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f | set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f | ||||
| dsvdch.f dsvdct.f dsxt1.f) | dsvdch.f dsvdct.f dsxt1.f) | ||||
| set(DEIGTST dchkee.f | |||||
| set(DEIGTST dchkee.F | |||||
| dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f | dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f | ||||
| dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f | dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f | ||||
| dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f | dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f | ||||
| @@ -79,7 +79,7 @@ set(DEIGTST dchkee.f | |||||
| dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f | dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f | ||||
| dstt22.f dsyt21.f dsyt22.f) | dstt22.f dsyt21.f dsyt22.f) | ||||
| set(ZEIGTST zchkee.f | |||||
| set(ZEIGTST zchkee.F | |||||
| zbdt01.f zbdt02.f zbdt03.f zbdt05.f | zbdt01.f zbdt02.f zbdt03.f zbdt05.f | ||||
| zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f | zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f | ||||
| zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f | zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f | ||||
| @@ -157,11 +157,11 @@ cleanobj: | |||||
| cleanexe: | cleanexe: | ||||
| rm -f xeigtst* | rm -f xeigtst* | ||||
| schkee.o: schkee.f | |||||
| schkee.o: schkee.F | |||||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | $(FC) $(FFLAGS_DRV) -c -o $@ $< | ||||
| dchkee.o: dchkee.f | |||||
| dchkee.o: dchkee.F | |||||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | $(FC) $(FFLAGS_DRV) -c -o $@ $< | ||||
| cchkee.o: cchkee.f | |||||
| cchkee.o: cchkee.F | |||||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | $(FC) $(FFLAGS_DRV) -c -o $@ $< | ||||
| zchkee.o: zchkee.f | |||||
| zchkee.o: zchkee.F | |||||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | $(FC) $(FFLAGS_DRV) -c -o $@ $< | ||||
| @@ -1034,6 +1034,10 @@ | |||||
| * ===================================================================== | * ===================================================================== | ||||
| PROGRAM CCHKEE | PROGRAM CCHKEE | ||||
| * | * | ||||
| #if defined(_OPENMP) | |||||
| use omp_lib | |||||
| #endif | |||||
| * | |||||
| * -- LAPACK test routine (version 3.7.0) -- | * -- LAPACK test routine (version 3.7.0) -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| @@ -1071,7 +1075,7 @@ | |||||
| CHARACTER*80 LINE | CHARACTER*80 LINE | ||||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | ||||
| $ NK, NN, NPARMS, NRHS, NTYPES, | $ NK, NN, NPARMS, NRHS, NTYPES, | ||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||||
| REAL EPS, S1, S2, THRESH, THRSHN | REAL EPS, S1, S2, THRESH, THRSHN | ||||
| * .. | * .. | ||||
| * .. Local Arrays .. | * .. Local Arrays .. | ||||
| @@ -1084,12 +1088,16 @@ | |||||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | ||||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | $ ISHFTS( MAXIN ), IACC22( MAXIN ) | ||||
| REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | ||||
| $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) | |||||
| COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), | |||||
| $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), | |||||
| $ RESULT( 500 ) | |||||
| COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), | |||||
| $ X( 5*NMAX ) | $ X( 5*NMAX ) | ||||
| * .. | * .. | ||||
| * .. Allocatable Arrays .. | |||||
| INTEGER AllocateStatus | |||||
| REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S | |||||
| COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK | |||||
| COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||||
| * .. | |||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAMEN | LOGICAL LSAMEN | ||||
| REAL SECOND, SLAMCH | REAL SECOND, SLAMCH | ||||
| @@ -1130,6 +1138,21 @@ | |||||
| DATA INTSTR / '0123456789' / | DATA INTSTR / '0123456789' / | ||||
| DATA IOLDSD / 0, 0, 0, 1 / | DATA IOLDSD / 0, 0, 0, 1 / | ||||
| * .. | * .. | ||||
| * .. Allocate memory dynamically .. | |||||
| * | |||||
| ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| * .. | |||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| A = 0.0 | A = 0.0 | ||||
| @@ -1846,8 +1869,16 @@ | |||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| CALL XLAENV( 1, 1 ) | CALL XLAENV( 1, 1 ) | ||||
| CALL XLAENV( 9, 25 ) | CALL XLAENV( 9, 25 ) | ||||
| IF( TSTERR ) | |||||
| $ CALL CERRST( 'CST', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| * Run the error-exit tests with a single OpenMP thread, then restore | |||||
| * the previous setting. OMP_GET_MAX_THREADS is used to save it, since | |||||
| * OMP_GET_NUM_THREADS returns 1 when called outside a parallel region | |||||
| * and restoring that value would keep all later tests on one thread. | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_MAX_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL CERRST( 'CST', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| DO 290 I = 1, NPARMS | DO 290 I = 1, NPARMS | ||||
| CALL XLAENV( 1, NBVAL( I ) ) | CALL XLAENV( 1, NBVAL( I ) ) | ||||
| CALL XLAENV( 2, NBMIN( I ) ) | CALL XLAENV( 2, NBMIN( I ) ) | ||||
| @@ -2305,8 +2336,16 @@ | |||||
| MAXTYP = 15 | MAXTYP = 15 | ||||
| NTYPES = MIN( MAXTYP, NTYPES ) | NTYPES = MIN( MAXTYP, NTYPES ) | ||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| IF( TSTERR ) | |||||
| $ CALL CERRST( 'CHB', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| * Run the error-exit tests with a single OpenMP thread, then restore | |||||
| * the previous setting. OMP_GET_MAX_THREADS is used to save it, since | |||||
| * OMP_GET_NUM_THREADS returns 1 when called outside a parallel region | |||||
| * and restoring that value would keep all later tests on one thread. | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_MAX_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL CERRST( 'CHB', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | ||||
| * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | ||||
| * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | ||||
| @@ -2436,7 +2475,14 @@ | |||||
| 380 CONTINUE | 380 CONTINUE | ||||
| WRITE( NOUT, FMT = 9994 ) | WRITE( NOUT, FMT = 9994 ) | ||||
| S2 = SECOND( ) | S2 = SECOND( ) | ||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||||
| * | |||||
| DEALLOCATE (S, STAT = AllocateStatus) | |||||
| DEALLOCATE (A, STAT = AllocateStatus) | |||||
| DEALLOCATE (B, STAT = AllocateStatus) | |||||
| DEALLOCATE (C, STAT = AllocateStatus) | |||||
| DEALLOCATE (RWORK, STAT = AllocateStatus) | |||||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||||
| * | * | ||||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | 9999 FORMAT( / ' Execution not attempted due to input errors' ) | ||||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | ||||
| @@ -1038,7 +1038,11 @@ | |||||
| *> \ingroup double_eig | *> \ingroup double_eig | ||||
| * | * | ||||
| * ===================================================================== | * ===================================================================== | ||||
| PROGRAM DCHKEE | |||||
| PROGRAM DCHKEE | |||||
| * | |||||
| #if defined(_OPENMP) | |||||
| use omp_lib | |||||
| #endif | |||||
| * | * | ||||
| * -- LAPACK test routine (version 3.7.0) -- | * -- LAPACK test routine (version 3.7.0) -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| @@ -1077,7 +1081,7 @@ | |||||
| CHARACTER*80 LINE | CHARACTER*80 LINE | ||||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | ||||
| $ NK, NN, NPARMS, NRHS, NTYPES, | $ NK, NN, NPARMS, NRHS, NTYPES, | ||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||||
| DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | ||||
| * .. | * .. | ||||
| * .. Local Arrays .. | * .. Local Arrays .. | ||||
| @@ -1089,10 +1093,13 @@ | |||||
| $ PVAL( MAXIN ) | $ PVAL( MAXIN ) | ||||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | ||||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | $ ISHFTS( MAXIN ), IACC22( MAXIN ) | ||||
| DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), | |||||
| $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), | |||||
| $ WORK( LWORK ), X( 5*NMAX ) | |||||
| DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), | |||||
| $ TAUB( NMAX ), X( 5*NMAX ) | |||||
| * .. | |||||
| * .. Allocatable Arrays .. | |||||
| INTEGER AllocateStatus | |||||
| DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK | |||||
| DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAMEN | LOGICAL LSAMEN | ||||
| @@ -1132,7 +1139,18 @@ | |||||
| * .. | * .. | ||||
| * .. Data statements .. | * .. Data statements .. | ||||
| DATA INTSTR / '0123456789' / | DATA INTSTR / '0123456789' / | ||||
| DATA IOLDSD / 0, 0, 0, 1 / | |||||
| DATA IOLDSD / 0, 0, 0, 1 / | |||||
| * .. | |||||
| * .. Allocate memory dynamically .. | |||||
| * | |||||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -1856,8 +1874,16 @@ | |||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| CALL XLAENV( 1, 1 ) | CALL XLAENV( 1, 1 ) | ||||
| CALL XLAENV( 9, 25 ) | CALL XLAENV( 9, 25 ) | ||||
| IF( TSTERR ) | |||||
| $ CALL DERRST( 'DST', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| * Run the error-exit tests with a single OpenMP thread, then restore | |||||
| * the previous setting. OMP_GET_MAX_THREADS is used to save it, since | |||||
| * OMP_GET_NUM_THREADS returns 1 when called outside a parallel region | |||||
| * and restoring that value would keep all later tests on one thread. | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_MAX_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL DERRST( 'DST', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| DO 290 I = 1, NPARMS | DO 290 I = 1, NPARMS | ||||
| CALL XLAENV( 1, NBVAL( I ) ) | CALL XLAENV( 1, NBVAL( I ) ) | ||||
| CALL XLAENV( 2, NBMIN( I ) ) | CALL XLAENV( 2, NBMIN( I ) ) | ||||
| @@ -2436,7 +2462,12 @@ | |||||
| 380 CONTINUE | 380 CONTINUE | ||||
| WRITE( NOUT, FMT = 9994 ) | WRITE( NOUT, FMT = 9994 ) | ||||
| S2 = DSECND( ) | S2 = DSECND( ) | ||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||||
| * | |||||
| DEALLOCATE (A, STAT = AllocateStatus) | |||||
| DEALLOCATE (B, STAT = AllocateStatus) | |||||
| DEALLOCATE (C, STAT = AllocateStatus) | |||||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||||
| * | * | ||||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | 9999 FORMAT( / ' Execution not attempted due to input errors' ) | ||||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | ||||
| @@ -1040,6 +1040,10 @@ | |||||
| * ===================================================================== | * ===================================================================== | ||||
| PROGRAM SCHKEE | PROGRAM SCHKEE | ||||
| * | * | ||||
| #if defined(_OPENMP) | |||||
| use omp_lib | |||||
| #endif | |||||
| * | |||||
| * -- LAPACK test routine (version 3.7.0) -- | * -- LAPACK test routine (version 3.7.0) -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| @@ -1077,7 +1081,7 @@ | |||||
| CHARACTER*80 LINE | CHARACTER*80 LINE | ||||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | ||||
| $ NK, NN, NPARMS, NRHS, NTYPES, | $ NK, NN, NPARMS, NRHS, NTYPES, | ||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||||
| REAL EPS, S1, S2, THRESH, THRSHN | REAL EPS, S1, S2, THRESH, THRSHN | ||||
| * .. | * .. | ||||
| * .. Local Arrays .. | * .. Local Arrays .. | ||||
| @@ -1089,10 +1093,13 @@ | |||||
| $ PVAL( MAXIN ) | $ PVAL( MAXIN ) | ||||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | ||||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | $ ISHFTS( MAXIN ), IACC22( MAXIN ) | ||||
| REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), | |||||
| $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), | |||||
| $ WORK( LWORK ), X( 5*NMAX ) | |||||
| REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), | |||||
| $ TAUB( NMAX ), X( 5*NMAX ) | |||||
| * .. | |||||
| * .. Allocatable Arrays .. | |||||
| INTEGER AllocateStatus | |||||
| REAL, DIMENSION(:), ALLOCATABLE :: WORK | |||||
| REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAMEN | LOGICAL LSAMEN | ||||
| @@ -1132,7 +1139,18 @@ | |||||
| * .. | * .. | ||||
| * .. Data statements .. | * .. Data statements .. | ||||
| DATA INTSTR / '0123456789' / | DATA INTSTR / '0123456789' / | ||||
| DATA IOLDSD / 0, 0, 0, 1 / | |||||
| DATA IOLDSD / 0, 0, 0, 1 / | |||||
| * .. | |||||
| * .. Allocate memory dynamically .. | |||||
| * | |||||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -1857,8 +1875,16 @@ | |||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| CALL XLAENV( 1, 1 ) | CALL XLAENV( 1, 1 ) | ||||
| CALL XLAENV( 9, 25 ) | CALL XLAENV( 9, 25 ) | ||||
| IF( TSTERR ) | |||||
| $ CALL SERRST( 'SST', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| * Run the error-exit tests with a single OpenMP thread, then restore | |||||
| * the previous setting. OMP_GET_MAX_THREADS is used to save it, since | |||||
| * OMP_GET_NUM_THREADS returns 1 when called outside a parallel region | |||||
| * and restoring that value would keep all later tests on one thread. | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_MAX_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL SERRST( 'SST', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| DO 290 I = 1, NPARMS | DO 290 I = 1, NPARMS | ||||
| CALL XLAENV( 1, NBVAL( I ) ) | CALL XLAENV( 1, NBVAL( I ) ) | ||||
| CALL XLAENV( 2, NBMIN( I ) ) | CALL XLAENV( 2, NBMIN( I ) ) | ||||
| @@ -2440,6 +2466,11 @@ | |||||
| WRITE( NOUT, FMT = 9994 ) | WRITE( NOUT, FMT = 9994 ) | ||||
| S2 = SECOND( ) | S2 = SECOND( ) | ||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | WRITE( NOUT, FMT = 9993 )S2 - S1 | ||||
| * | |||||
| DEALLOCATE (A, STAT = AllocateStatus) | |||||
| DEALLOCATE (B, STAT = AllocateStatus) | |||||
| DEALLOCATE (C, STAT = AllocateStatus) | |||||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||||
| * | * | ||||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | 9999 FORMAT( / ' Execution not attempted due to input errors' ) | ||||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | ||||
| @@ -1034,6 +1034,10 @@ | |||||
| * ===================================================================== | * ===================================================================== | ||||
| PROGRAM ZCHKEE | PROGRAM ZCHKEE | ||||
| * | * | ||||
| #if defined(_OPENMP) | |||||
| use omp_lib | |||||
| #endif | |||||
| * | |||||
| * -- LAPACK test routine (version 3.7.0) -- | * -- LAPACK test routine (version 3.7.0) -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| @@ -1071,7 +1075,7 @@ | |||||
| CHARACTER*80 LINE | CHARACTER*80 LINE | ||||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | ||||
| $ NK, NN, NPARMS, NRHS, NTYPES, | $ NK, NN, NPARMS, NRHS, NTYPES, | ||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||||
| DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | ||||
| * .. | * .. | ||||
| * .. Local Arrays .. | * .. Local Arrays .. | ||||
| @@ -1084,12 +1088,16 @@ | |||||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | ||||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | $ ISHFTS( MAXIN ), IACC22( MAXIN ) | ||||
| DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | ||||
| $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) | |||||
| COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), | |||||
| $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), | |||||
| $ RESULT( 500 ) | |||||
| COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), | |||||
| $ X( 5*NMAX ) | $ X( 5*NMAX ) | ||||
| * .. | * .. | ||||
| * .. Allocatable Arrays .. | |||||
| INTEGER AllocateStatus | |||||
| DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S | |||||
| COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK | |||||
| COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||||
| * .. | |||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAMEN | LOGICAL LSAMEN | ||||
| DOUBLE PRECISION DLAMCH, DSECND | DOUBLE PRECISION DLAMCH, DSECND | ||||
| @@ -1130,6 +1138,21 @@ | |||||
| DATA INTSTR / '0123456789' / | DATA INTSTR / '0123456789' / | ||||
| DATA IOLDSD / 0, 0, 0, 1 / | DATA IOLDSD / 0, 0, 0, 1 / | ||||
| * .. | * .. | ||||
| * .. Allocate memory dynamically .. | |||||
| * | |||||
| ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||||
| * .. | |||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| A = 0.0 | A = 0.0 | ||||
| @@ -1846,8 +1869,16 @@ | |||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| CALL XLAENV( 1, 1 ) | CALL XLAENV( 1, 1 ) | ||||
| CALL XLAENV( 9, 25 ) | CALL XLAENV( 9, 25 ) | ||||
| IF( TSTERR ) | |||||
| $ CALL ZERRST( 'ZST', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_NUM_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL ZERRST( 'ZST', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| DO 290 I = 1, NPARMS | DO 290 I = 1, NPARMS | ||||
| CALL XLAENV( 1, NBVAL( I ) ) | CALL XLAENV( 1, NBVAL( I ) ) | ||||
| CALL XLAENV( 2, NBMIN( I ) ) | CALL XLAENV( 2, NBMIN( I ) ) | ||||
| @@ -2303,8 +2334,16 @@ | |||||
| MAXTYP = 15 | MAXTYP = 15 | ||||
| NTYPES = MIN( MAXTYP, NTYPES ) | NTYPES = MIN( MAXTYP, NTYPES ) | ||||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | ||||
| IF( TSTERR ) | |||||
| $ CALL ZERRST( 'ZHB', NOUT ) | |||||
| IF( TSTERR ) THEN | |||||
| #if defined(_OPENMP) | |||||
| N_THREADS = OMP_GET_NUM_THREADS() | |||||
| CALL OMP_SET_NUM_THREADS(1) | |||||
| #endif | |||||
| CALL ZERRST( 'ZHB', NOUT ) | |||||
| #if defined(_OPENMP) | |||||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||||
| #endif | |||||
| END IF | |||||
| * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | ||||
| * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | ||||
| * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | ||||
| @@ -2435,6 +2474,13 @@ | |||||
| WRITE( NOUT, FMT = 9994 ) | WRITE( NOUT, FMT = 9994 ) | ||||
| S2 = DSECND( ) | S2 = DSECND( ) | ||||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | WRITE( NOUT, FMT = 9993 )S2 - S1 | ||||
| * | |||||
| DEALLOCATE (S, STAT = AllocateStatus) | |||||
| DEALLOCATE (A, STAT = AllocateStatus) | |||||
| DEALLOCATE (B, STAT = AllocateStatus) | |||||
| DEALLOCATE (C, STAT = AllocateStatus) | |||||
| DEALLOCATE (RWORK, STAT = AllocateStatus) | |||||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||||
| * | * | ||||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | 9999 FORMAT( / ' Execution not attempted due to input errors' ) | ||||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | ||||
| @@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifndef PARAM_H | #ifndef PARAM_H | ||||
| #define PARAM_H | #define PARAM_H | ||||
| #define LONGCAST (BLASLONG) | |||||
| #if defined(__BYTE_ORDER__) | |||||
| #if __GNUC__ < 9 | |||||
| #undef LONGCAST | |||||
| #define LONGCAST | |||||
| #endif | |||||
| #endif | |||||
| #define SBGEMM_DEFAULT_UNROLL_N 4 | #define SBGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define SBGEMM_DEFAULT_UNROLL_M 8 | #define SBGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SBGEMM_DEFAULT_UNROLL_MN 32 | #define SBGEMM_DEFAULT_UNROLL_MN 32 | ||||
| @@ -85,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 256 | #define GEMM_DEFAULT_OFFSET_B 256 | ||||
| #define GEMM_DEFAULT_ALIGN 0x01ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -157,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 832 | #define GEMM_DEFAULT_OFFSET_B 832 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -237,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 832 | #define GEMM_DEFAULT_OFFSET_B 832 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||||
| @@ -330,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 832 | #define GEMM_DEFAULT_OFFSET_B 832 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||||
| @@ -422,7 +430,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 832 | #define GEMM_DEFAULT_OFFSET_B 832 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||||
| @@ -515,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 832 | #define GEMM_DEFAULT_OFFSET_B 832 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||||
| @@ -607,7 +615,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -726,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 384 | #define GEMM_DEFAULT_OFFSET_B 384 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -774,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 256 | #define GEMM_DEFAULT_OFFSET_B 256 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -821,7 +829,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 256 | #define GEMM_DEFAULT_OFFSET_B 256 | ||||
| #define GEMM_DEFAULT_ALIGN 0x01ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -890,7 +898,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #ifdef HAVE_SSE | #ifdef HAVE_SSE | ||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| @@ -945,7 +953,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #ifdef CORE_YONAH | #ifdef CORE_YONAH | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| @@ -1011,7 +1019,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 32 | #define GEMM_DEFAULT_OFFSET_B 32 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1068,7 +1076,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 256 | #define GEMM_DEFAULT_OFFSET_B 256 | ||||
| #endif | #endif | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1128,7 +1136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 448 | #define GEMM_DEFAULT_OFFSET_A 448 | ||||
| #define GEMM_DEFAULT_OFFSET_B 128 | #define GEMM_DEFAULT_OFFSET_B 128 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1201,7 +1209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 128 | #define GEMM_DEFAULT_OFFSET_A 128 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1272,7 +1280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 128 | #define GEMM_DEFAULT_OFFSET_A 128 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1344,7 +1352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 32 | #define GEMM_DEFAULT_OFFSET_A 32 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1417,7 +1425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1510,7 +1518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1636,7 +1644,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1877,7 +1885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | #define GEMM_DEFAULT_OFFSET_A 64 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||
| @@ -1939,7 +1947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 128 | #define GEMM_DEFAULT_OFFSET_B 128 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| @@ -1993,7 +2001,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 512 | #define GEMM_DEFAULT_OFFSET_A 512 | ||||
| #define GEMM_DEFAULT_OFFSET_B 512 | #define GEMM_DEFAULT_OFFSET_B 512 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2061,7 +2069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 8192 | #define GEMM_DEFAULT_OFFSET_B 8192 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef PPCG4 | #ifdef PPCG4 | ||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 1024 | #define GEMM_DEFAULT_OFFSET_B 1024 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 2688 | #define GEMM_DEFAULT_OFFSET_A 2688 | ||||
| #define GEMM_DEFAULT_OFFSET_B 3072 | #define GEMM_DEFAULT_OFFSET_B 3072 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL | |||||
| #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| @@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A (32 * 0) | #define GEMM_DEFAULT_OFFSET_A (32 * 0) | ||||
| #define GEMM_DEFAULT_OFFSET_B (32 * 0) | #define GEMM_DEFAULT_OFFSET_B (32 * 0) | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A (32 * 0) | #define GEMM_DEFAULT_OFFSET_A (32 * 0) | ||||
| #define GEMM_DEFAULT_OFFSET_B (32 * 0) | #define GEMM_DEFAULT_OFFSET_B (32 * 0) | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(POWER3) || defined(POWER4) || defined(POWER5) | #if defined(POWER3) || defined(POWER4) || defined(POWER5) | ||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 2048 | #define GEMM_DEFAULT_OFFSET_B 2048 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 384 | #define GEMM_DEFAULT_OFFSET_A 384 | ||||
| #define GEMM_DEFAULT_OFFSET_B 1024 | #define GEMM_DEFAULT_OFFSET_B 1024 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2344,7 +2352,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #if defined(__32BIT__) | #if defined(__32BIT__) | ||||
| #warning using BINARY32==POWER6 | #warning using BINARY32==POWER6 | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| @@ -2397,7 +2406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SWITCH_RATIO 16 | |||||
| #define GEMM_PREFERED_SIZE 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| @@ -2433,24 +2445,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||||
| #define SWITCH_RATIO 16 | |||||
| #define GEMM_PREFERED_SIZE 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #else | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | #define DGEMM_DEFAULT_UNROLL_N 8 | ||||
| #endif | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | #define CGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | #define ZGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define SGEMM_DEFAULT_P 832 | |||||
| #define DGEMM_DEFAULT_P 320 | |||||
| #define SGEMM_DEFAULT_P 512 | |||||
| #define DGEMM_DEFAULT_P 384 | |||||
| #define CGEMM_DEFAULT_P 512 | #define CGEMM_DEFAULT_P 512 | ||||
| #define ZGEMM_DEFAULT_P 256 | #define ZGEMM_DEFAULT_P 256 | ||||
| #define SGEMM_DEFAULT_Q 1026 | |||||
| #define DGEMM_DEFAULT_Q 960 | |||||
| #define SGEMM_DEFAULT_Q 512 | |||||
| #define DGEMM_DEFAULT_Q 512 | |||||
| #define CGEMM_DEFAULT_Q 1026 | #define CGEMM_DEFAULT_Q 1026 | ||||
| #define ZGEMM_DEFAULT_Q 1026 | #define ZGEMM_DEFAULT_Q 1026 | ||||
| @@ -2480,7 +2500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 2048 | #define GEMM_DEFAULT_OFFSET_B 2048 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| @@ -2512,7 +2532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 2048 | #define GEMM_DEFAULT_OFFSET_B 2048 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2543,7 +2563,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| @@ -2578,7 +2598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #ifdef HAVE_MSA | #ifdef HAVE_MSA | ||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| @@ -2634,7 +2654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2675,7 +2695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL | |||||
| #ifdef HAVE_MSA | #ifdef HAVE_MSA | ||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| @@ -2724,7 +2744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef RISCV64_GENERIC | #ifdef RISCV64_GENERIC | ||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | #define SGEMM_DEFAULT_UNROLL_N 2 | ||||
| @@ -2805,7 +2825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -2846,7 +2866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | #define SGEMM_DEFAULT_UNROLL_N 2 | ||||
| @@ -3121,7 +3141,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | #define SGEMM_DEFAULT_UNROLL_N 2 | ||||
| @@ -3162,7 +3182,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -3203,7 +3223,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -3244,7 +3264,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 | #define SGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | #define SGEMM_DEFAULT_UNROLL_N 2 | ||||
| @@ -3283,7 +3303,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | #define SGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -3365,7 +3385,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 2 | #define SGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 2 | #define DGEMM_DEFAULT_UNROLL_N 2 | ||||