Update branch from develop for 0.3.14 release (tags/v0.3.14)
| @@ -190,3 +190,27 @@ steps: | |||
| - make -C ctest $COMMON_FLAGS | |||
| - make -C utest $COMMON_FLAGS | |||
| - make -C cpp_thread_test dgemm_tester | |||
| --- | |||
| kind: pipeline | |||
| name: arm64_gcc10 | |||
| platform: | |||
| os: linux | |||
| arch: arm64 | |||
| steps: | |||
| - name: Build and Test | |||
| image: ubuntu:20.04 | |||
| environment: | |||
| CC: gcc-10 | |||
| FC: gfortran-10 | |||
| COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' | |||
| commands: | |||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||
| - apt-get update -y | |||
| - apt-get install -y make $CC gfortran-10 perl python g++ | |||
| - $CC --version | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||
| - make -C utest $COMMON_FLAGS | |||
| - make -C test $COMMON_FLAGS | |||
| @@ -44,6 +44,11 @@ jobs: | |||
| if: github.event_name != 'pull_request' | |||
| run: brew update || true | |||
| - name: unlink installed gcc to allow updating | |||
| run: | | |||
| brew unlink gcc@8 | |||
| brew unlink gcc@9 | |||
| - name: Install prerequisites | |||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | |||
| @@ -89,5 +89,7 @@ build.* | |||
| *.swp | |||
| benchmark/*.goto | |||
| benchmark/smallscaling | |||
| .vscode | |||
| CMakeCache.txt | |||
| CMakeFiles/* | |||
| .vscode | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 13) | |||
| set(OpenBLAS_PATCH_VERSION 14) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -14,6 +14,9 @@ include(GNUInstallDirs) | |||
| include(CMakePackageConfigHelpers) | |||
| # With MSVC, default to a build without Fortran; an explicitly provided NOFORTRAN value is left untouched. | |||
| if(MSVC AND NOT DEFINED NOFORTRAN) | |||
| set(NOFORTRAN ON) | |||
| endif() | |||
| ####### | |||
| if(MSVC) | |||
| @@ -229,7 +232,7 @@ if (NOT NO_CBLAS) | |||
| add_subdirectory(utest) | |||
| endif() | |||
| if (NOT MSVC AND NOT NOFORTRAN) | |||
| if (NOT NOFORTRAN) | |||
| # Build test and ctest | |||
| add_subdirectory(test) | |||
| if(NOT NO_CBLAS) | |||
| @@ -1,4 +1,52 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.14 | |||
| 17-Mar-2021 | |||
| common: | |||
| * Fixed a race condition on thread shutdown in non-OpenMP builds | |||
| * Fixed custom BUFFERSIZE option getting ignored in gmake builds | |||
| * Fixed CMake compilation of the TRMM kernels for GENERIC platforms | |||
| * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT | |||
| * Improved performance of OMATCOPY_RT across all platforms | |||
| * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl | |||
| * Fixed potential misreading of the GCC compiler version in the build scripts | |||
| * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) | |||
| * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) | |||
| RISCV: | |||
| * Fixed compilation on RISCV (missing entry in getarch) | |||
| POWER: | |||
| * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions | |||
| * Added support for compilation on FreeBSD/ppc64le | |||
| * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL | |||
| * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM | |||
| * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 | |||
| * Improved SCOPY and CCOPY performance on POWER10 | |||
| * Improved SGEMM and DGEMM performance on POWER10 | |||
| * Added support for compilation with the NVIDIA HPC compiler | |||
| x86_64: | |||
| * Added an optimized bfloat16 GEMM kernel for Cooperlake | |||
| * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus | |||
| * Improved the performance of SASUM, DASUM, SROT and DROT on AMD Ryzen CPUs | |||
| * Added support for compilation with the NAG Fortran compiler | |||
| * Fixed recognition of the AMD AOCC compiler | |||
| * Fixed compilation for DYNAMIC_ARCH with clang on Windows | |||
| * Added support for running the BLAS/CBLAS tests on Windows | |||
| * Fixed signatures of the tls callback functions for Windows x64 | |||
| * Fixed various issues with fma intrinsics support handling | |||
| ARM: | |||
| * Added support for embedded Cortex M targets via a new option EMBEDDED | |||
| ARMV8: | |||
| * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf | |||
| * Added support for the DYNAMIC_LIST option | |||
| * Added support for compilation with the NVIDIA HPC compiler | |||
| * Added support for compiling with the NAG Fortran compiler | |||
| ==================================================================== | |||
| Version 0.3.13 | |||
| 12-Dec-2020 | |||
| @@ -59,6 +59,9 @@ endif | |||
| @$(CC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${cverinfo}" ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ | |||
| else \ | |||
| echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ | |||
| @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(FC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${fverinfo}" ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ | |||
| else \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
| @@ -1,28 +1,38 @@ | |||
| ifneq ($(C_COMPILER), PGI) | |||
| # Per-core -march/-mtune selection: each block appends the flags to CCOMMON_OPT and, | |||
| # unless the Fortran compiler is NAG, appends the same flags to FCOMMON_OPT. | |||
| ifeq ($(CORE), ARMV8) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA57) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA72) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA73) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| endif | |||
| endif | |||
| # Use a72 tunings because Neoverse-N1 is only available | |||
| # in GCC>=9 | |||
| @@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1) | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), FALKOR) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX3T110) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), VORTEX) | |||
| CCOMMON_OPT += -march=armv8.3-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.3-a | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -10,9 +10,11 @@ USE_OPENMP = 1 | |||
| endif | |||
| # POWER10 optimization flags for C and Fortran; nothing is added when the C compiler is PGI. | |||
| # NOTE(review): -fno-fast-math follows -Ofast, presumably to keep the remaining -Ofast optimizations while disabling fast-math - confirm. | |||
| ifeq ($(CORE), POWER10) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.13 | |||
| VERSION = 0.3.14 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| else ifeq ($(ARCH), powerpc64) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc64le) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), i386) | |||
| @@ -181,7 +183,7 @@ endif | |||
| # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. | |||
| ifeq ($(HOSTARCH), x86_64) | |||
| ifeq ($(findstring pgcc,$(HOSTCC)),) | |||
| ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) | |||
| GETARCH_FLAGS += -march=native | |||
| endif | |||
| endif | |||
| @@ -623,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| DYNAMIC_CORE += EMAG8180 | |||
| DYNAMIC_CORE += THUNDERX3T110 | |||
| # A user-supplied DYNAMIC_LIST replaces the accumulated DYNAMIC_CORE list; ARMV8 is always kept as the first entry. | |||
| ifdef DYNAMIC_LIST | |||
| override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) | |||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 | |||
| # Add one -DDYN_<core> define per entry of DYNAMIC_LIST. | |||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| @@ -663,6 +670,7 @@ endif | |||
| endif # ARCH zarch | |||
| ifeq ($(ARCH), power) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| @@ -689,6 +697,10 @@ else | |||
| # No comma after the function name: "$(info, ...)" is expanded as a reference to an undefined variable and prints nothing. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| else | |||
| DYNAMIC_CORE = POWER8 | |||
| DYNAMIC_CORE += POWER9 | |||
| endif | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| @@ -847,9 +859,19 @@ endif | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| # Classify the PGI compiler release from the second line of "$(CC) --version". | |||
| # Each variable holds 1 or 0: major > 20, major >= 20, and (despite the GE name) minor == 11. | |||
| PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||
| PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) | |||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) | |||
| # Build the three-digit key from the variables defined above. The middle digit previously came from | |||
| # the undefined PGCVERSIONEQ20, which left a two-digit key that could never match the list below. | |||
| PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11) | |||
| # 110 / 111: major version above 20; 011: major version 20 with minor version 11. | |||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) | |||
| NEWPGI := 1 | |||
| endif | |||
| ifdef BINARY64 | |||
| ifeq ($(ARCH), x86_64) | |||
| CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm | |||
| CCOMMON_OPT += -tp p7-64 | |||
| ifneq ($(NEWPGI),1) | |||
| CCOMMON_OPT += -D__MMX__ -Mnollvm | |||
| endif | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER8) | |||
| @@ -877,13 +899,25 @@ endif | |||
| # Fortran Compiler dependent settings | |||
| # | |||
| # Options for the NAG Fortran compiler. | |||
| ifeq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe | |||
| # -i8 is added only when INTERFACE64 is defined and not 0. | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| # -openmp is added only when USE_OPENMP is exactly 1. | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -openmp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(ARCH), x86_64) | |||
| FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) | |||
| ifeq ($(FLANG_VENDOR),AOCC) | |||
| FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") | |||
| ifeq ($(FLANG_VENDOR), AMD) | |||
| FCOMMON_OPT += -fno-unroll-loops | |||
| endif | |||
| endif | |||
| @@ -1029,18 +1063,24 @@ ifeq ($(ARCH), x86_64) | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER6) | |||
| $(warning NVIDIA HPC compilers do not support POWER6.) | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| FCOMMON_OPT += -tp pwr8 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| FCOMMON_OPT += -tp pwr9 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| $(warning NVIDIA HPC compilers do not support POWER10.) | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -tp p7 | |||
| endif | |||
| FCOMMON_OPT += -Mrecursive | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -mp | |||
| endif | |||
| @@ -1179,6 +1219,8 @@ CCOMMON_OPT += -fPIC | |||
| endif | |||
| ifeq ($(F_COMPILER), SUN) | |||
| FCOMMON_OPT += -pic | |||
| else ifeq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -PIC | |||
| else | |||
| FCOMMON_OPT += -fPIC | |||
| endif | |||
| @@ -1256,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI | |||
| EXTRALIB += -lpapi -lperfctr | |||
| endif | |||
| ifdef BUFFERSIZE | |||
| CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) | |||
| endif | |||
| ifdef DYNAMIC_THREADS | |||
| CCOMMON_OPT += -DDYNAMIC_THREADS | |||
| endif | |||
| @@ -1433,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS) | |||
| LAPACK_FPFLAGS := $(FPFLAGS) | |||
| endif | |||
| # For the NAG Fortran compiler, rebuild LAPACK_FFLAGS from FFLAGS with the listed x86 SIMD options removed. | |||
| # NOTE(review): "-mskylake-avx512" does not match the "-march=skylake-avx512" spelling used when the flag is added - confirm the intended option. | |||
| ifeq ($(F_COMPILER),NAG) | |||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||
| ifdef INTERFACE64 | |||
| @@ -10,40 +10,46 @@ endif | |||
| ifdef HAVE_SSE3 | |||
| CCOMMON_OPT += -msse3 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -msse3 | |||
| endif | |||
| endif | |||
| ifdef HAVE_SSSE3 | |||
| CCOMMON_OPT += -mssse3 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mssse3 | |||
| endif | |||
| endif | |||
| ifdef HAVE_SSE4_1 | |||
| CCOMMON_OPT += -msse4.1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -msse4.1 | |||
| endif | |||
| endif | |||
| ifndef OLDGCC | |||
| ifdef HAVE_AVX | |||
| CCOMMON_OPT += -mavx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mavx | |||
| endif | |||
| endif | |||
| endif | |||
| ifndef NO_AVX2 | |||
| ifdef HAVE_AVX2 | |||
| CCOMMON_OPT += -mavx2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| # Enable FMA3 code generation unless the compiler is flagged as too old (OLDGCC). | |||
| # As with the other -m options in this file, the flag is not passed to the NAG Fortran compiler. | |||
| ifndef OLDGCC | |||
| ifdef HAVE_FMA3 | |||
| CCOMMON_OPT += -mfma | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mfma | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC) | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -13,10 +13,14 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version. | |||
| Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. | |||
| For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: | |||
| <https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six | |||
| 20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful. | |||
| ## Binary Packages | |||
| We provide official binary packages for the following platform: | |||
| @@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| - **AIX**: Supported on PPC up to POWER8 | |||
| - **Haiku**: Supported by the community. We don't actively test the library on this OS. | |||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS: | |||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. | |||
| ## Usage | |||
| @@ -30,10 +30,10 @@ environment: | |||
| CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | |||
| matrix: | |||
| - COMPILER: clang-cl | |||
| WITH_FORTRAN: yes | |||
| WITH_FORTRAN: ON | |||
| - COMPILER: clang-cl | |||
| DYNAMIC_ARCH: ON | |||
| WITH_FORTRAN: no | |||
| WITH_FORTRAN: OFF | |||
| - COMPILER: cl | |||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
| DYNAMIC_ARCH: OFF | |||
| @@ -47,12 +47,7 @@ environment: | |||
| install: | |||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | |||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake | |||
| - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja | |||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja | |||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | |||
| - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | |||
| - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | |||
| @@ -68,15 +63,14 @@ before_build: | |||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
| build_script: | |||
| - cmake --build . | |||
| test_script: | |||
| - echo Running Test | |||
| - cd utest | |||
| - openblas_utest | |||
| - ctest -j2 | |||
| @@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){ | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| struct timeval start, stop; | |||
| #elif defined(__APPLE__) | |||
| mach_timebase_info_data_t info; | |||
| uint64_t start = 0, stop = 0; | |||
| #else | |||
| struct timespec start = { 0, 0 }, stop = { 0, 0 }; | |||
| #endif | |||
| @@ -82,6 +85,9 @@ double getsec() | |||
| { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| #elif defined(__APPLE__) | |||
| mach_timebase_info(&info); | |||
| return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; | |||
| #else | |||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; | |||
| #endif | |||
| @@ -90,6 +96,8 @@ double getsec() | |||
| void begin() { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #elif defined(__APPLE__) | |||
| start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||
| #else | |||
| clock_gettime(CLOCK_REALTIME, &start); | |||
| #endif | |||
| @@ -98,7 +106,9 @@ void begin() { | |||
| /* Store the stop timestamp in 'stop': gettimeofday() on Windows or when _POSIX_TIMERS is undefined, | |||
| clock_gettime_nsec_np(CLOCK_UPTIME_RAW) on Apple, clock_gettime(CLOCK_REALTIME) otherwise. */ | |||
| void end() { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| #elif defined(__APPLE__) | |||
| stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||
| #else | |||
| clock_gettime(CLOCK_REALTIME, &stop); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -1,11 +1,11 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`; | |||
| $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | |||
| chop($hostarch); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, | |||
| void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||
| void cblas_crotg(void *a, void *b, float *c, void *s); | |||
| void cblas_zrotg(void *a, void *b, double *c, void *s); | |||
| void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | |||
| void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | |||
| @@ -45,6 +45,9 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| # ARM64: default list of cores built for DYNAMIC_ARCH. | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| # A non-empty DYNAMIC_LIST replaces the default list; ARMV8 is always kept as the first entry. | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| endif () | |||
| endif () | |||
| if (POWER) | |||
| @@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC}) | |||
| endforeach () | |||
| set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") | |||
| configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) | |||
| include_directories(${lapacke_include_dir}) | |||
| set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||
| @@ -148,16 +148,20 @@ endif () | |||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | |||
| if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | |||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| # endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| @@ -233,6 +237,11 @@ if (BINARY64) | |||
| endif () | |||
| endif () | |||
| # EMBEDDED builds: define OS_EMBEDDED and append the Thumb / Cortex-M4 hard-float code generation flags. | |||
| if(EMBEDDED) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") | |||
| endif() | |||
| if (NEED_PIC) | |||
| if (${CMAKE_C_COMPILER} STREQUAL "IBM") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | |||
| @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) | |||
| set (CMAKE_MATCH_1 CMAKE_C_COMPILER) | |||
| endif () | |||
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| @@ -122,7 +122,7 @@ extern "C" { | |||
| #define ATOM GOTO_ATOM | |||
| #undef GOTO_ATOM | |||
| #endif | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -134,6 +134,9 @@ extern "C" { | |||
| #if defined(SMP) || defined(USE_LOCKING) | |||
| #include <pthread.h> | |||
| #endif | |||
| #else | |||
| #include <time.h> | |||
| #include <math.h> | |||
| #endif | |||
| #if defined(OS_SUNOS) | |||
| @@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){ | |||
| struct timespec ts; | |||
| clock_gettime(CLOCK_MONOTONIC, &ts); | |||
| return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| struct timeval tv; | |||
| gettimeofday(&tv,NULL); | |||
| return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| #define RPCC_DEFINED | |||
| @@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #include "common_linux.h" | |||
| #endif | |||
| #ifdef OS_EMBEDDED | |||
| #define DTB_DEFAULT_ENTRIES 64 | |||
| #endif | |||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | |||
| #ifdef __NetBSD__ | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define INLINE inline | |||
| #ifdef F_INTERFACE_FLANG | |||
| #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) | |||
| #define RETURN_BY_STACK | |||
| #else | |||
| #define RETURN_BY_COMPLEX | |||
| @@ -1418,6 +1418,15 @@ int get_cpuname(void){ | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| case 12: // Tiger Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| @@ -1436,6 +1445,15 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: // Rocket Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| @@ -2014,6 +2032,19 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| #endif | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 5: | |||
| switch (model) { | |||
| @@ -2102,6 +2133,16 @@ int get_coretype(void){ | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| if (model == 12) { // Tiger Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| @@ -5,9 +5,18 @@ enable_language(Fortran) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
| # Generate a small helper script that runs a test executable (first argument) with the | |||
| # contents of an input file (second argument) fed to its standard input. | |||
| # test_helper holds the command prefix used to invoke that script. | |||
| if(WIN32) | |||
| file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1" | |||
| "$ErrorActionPreference = \"Stop\"\n" | |||
| "Get-Content $args[1] | & $args[0]\n" | |||
| ) | |||
| set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") | |||
| else() | |||
| # Quote both positional parameters so build or source paths containing spaces still work. | |||
| file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" | |||
| "\"$1\" < \"$2\"\n" | |||
| ) | |||
| set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") | |||
| endif() | |||
| foreach(float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char_upper) | |||
| @@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| c_${float_char}blas1.c) | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat1" | |||
| COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") | |||
| COMMAND $<TARGET_FILE:x${float_char}cblat1>) | |||
| #level2 | |||
| add_executable(x${float_char}cblat2 | |||
| @@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| constant.c) | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat2" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||
| #level3 | |||
| add_executable(x${float_char}cblat3 | |||
| @@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| constant.c) | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat3" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| endforeach() | |||
| @@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| CEXTRALIB = -lgomp | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| @@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i; | |||
| if (!blas_server_avail) return 0; | |||
| LOCK_COMMAND(&server_lock); | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| if (blas_server_avail) { | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| pthread_mutex_lock (&thread_status[i].lock); | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| pthread_cond_signal (&thread_status[i].wakeup); | |||
| pthread_mutex_lock (&thread_status[i].lock); | |||
| pthread_mutex_unlock(&thread_status[i].lock); | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| pthread_cond_signal (&thread_status[i].wakeup); | |||
| } | |||
| pthread_mutex_unlock(&thread_status[i].lock); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_join(blas_threads[i], NULL); | |||
| } | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_mutex_destroy(&thread_status[i].lock); | |||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_join(blas_threads[i], NULL); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_mutex_destroy(&thread_status[i].lock); | |||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||
| } | |||
| #ifdef NEED_STACKATTR | |||
| pthread_attr_destory(&attr); | |||
| pthread_attr_destroy(&attr); | |||
| #endif | |||
| blas_server_avail = 0; | |||
| blas_server_avail = 0; | |||
| } | |||
| UNLOCK_COMMAND(&server_lock); | |||
| return 0; | |||
| @@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){ | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 12) { // Tiger Lake | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| @@ -656,7 +671,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| @@ -666,7 +681,20 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } | |||
| if (model == 7) { | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| @@ -43,6 +43,63 @@ | |||
| #endif | |||
| extern gotoblas_t gotoblas_ARMV8; | |||
| #ifdef DYNAMIC_LIST | |||
| #ifdef DYN_CORTEXA53 | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| #else | |||
| #define gotoblas_CORTEXA53 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA57 | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| #else | |||
| #define gotoblas_CORTEXA57 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA72 | |||
| extern gotoblas_t gotoblas_CORTEXA72; | |||
| #else | |||
| #define gotoblas_CORTEXA72 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA73 | |||
| extern gotoblas_t gotoblas_CORTEXA73; | |||
| #else | |||
| #define gotoblas_CORTEXA73 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_FALKOR | |||
| extern gotoblas_t gotoblas_FALKOR; | |||
| #else | |||
| #define gotoblas_FALKOR gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_TSV110 | |||
| extern gotoblas_t gotoblas_TSV110; | |||
| #else | |||
| #define gotoblas_TSV110 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX | |||
| extern gotoblas_t gotoblas_THUNDERX; | |||
| #else | |||
| #define gotoblas_THUNDERX gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX2T99 | |||
| extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| #else | |||
| #define gotoblas_THUNDERX2T99 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX3T110 | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #else | |||
| #define gotoblas_THUNDERX3T110 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_EMAG8180 | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| #else | |||
| #define gotoblas_EMAG8180 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_NEOVERSEN1 | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #else | |||
| #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 | |||
| #endif | |||
| #else | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| extern gotoblas_t gotoblas_CORTEXA72; | |||
| @@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| @@ -68,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__("mrs %0, "#id : "=r" (var)); \ | |||
| __asm__ ("mrs %0, "#id : "=r" (var)); \ | |||
| }) | |||
| static char *corename[] = { | |||
| @@ -27,7 +27,9 @@ static char *corename[] = { | |||
| #define NUM_CORETYPES 4 | |||
| char *gotoblas_corename(void) { | |||
| #ifndef C_PGI | |||
| if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| @@ -38,10 +40,164 @@ char *gotoblas_corename(void) { | |||
| return corename[0]; | |||
| } | |||
| #if defined(__clang__) | |||
| /* | |||
| * clang-only stand-in for __builtin_cpu_supports(): ignores its argument and | |||
| * always returns 0, i.e. every queried CPU feature is reported as absent. | |||
| * NOTE(review): presumably needed because the targeted clang versions lack | |||
| * this builtin on POWER - confirm. | |||
| */ | |||
| static int __builtin_cpu_supports(char* arg) | |||
| { | |||
| return 0; | |||
| } | |||
| #endif | |||
| #if defined(C_PGI) || defined(__clang__) | |||
| /* | |||
| * NV HPC compilers do not yet implement __builtin_cpu_is(). | |||
| * Fake a version here for use in the CPU detection code below. | |||
| * | |||
| * Strategy here is to first check the CPU to see what it actually is, | |||
| * and then test the input to see if what the CPU actually is matches | |||
| * what was requested. | |||
| */ | |||
| #include <string.h> | |||
| /* | |||
| * Define POWER processor version table. | |||
| * | |||
| * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time | |||
| */ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_POWER5 5 | |||
| #define CPU_POWER6 6 | |||
| #define CPU_POWER8 8 | |||
| #define CPU_POWER9 9 | |||
| #define CPU_POWER10 10 | |||
| /* | |||
| * PVR lookup table. A processor version register value matches a row when | |||
| * (pvr & pvr_mask) == pvr_value. __builtin_cpu_is() below scans the rows in | |||
| * order and stops at the first match, so rows with narrower masks are listed | |||
| * before broader rows of the same family. The final all-zero row matches any | |||
| * value and acts as the "Unknown" terminator. | |||
| * NOTE(review): the POWER7 and POWER7+ rows report CPU_POWER6; no CPU_POWER7 | |||
| * constant is defined above - presumably intentional, confirm. | |||
| */ | |||
| static struct { | |||
| uint32_t pvr_mask; | |||
| uint32_t pvr_value; | |||
| const char* cpu_name; | |||
| uint32_t cpu_type; | |||
| } pvrPOWER [] = { | |||
| { /* POWER6 in P5+ mode; 2.04-compliant processor */ | |||
| .pvr_mask = 0xffffffff, | |||
| .pvr_value = 0x0f000001, | |||
| .cpu_name = "POWER5+", | |||
| .cpu_type = CPU_POWER5, | |||
| }, | |||
| { /* Power6 aka POWER6X*/ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x003e0000, | |||
| .cpu_name = "POWER6 (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power7 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x003f0000, | |||
| .cpu_name = "POWER7 (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power7+ */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004A0000, | |||
| .cpu_name = "POWER7+ (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power8E */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004b0000, | |||
| .cpu_name = "POWER8E (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power8NVL */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004c0000, | |||
| .cpu_name = "POWER8NVL (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power8 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004d0000, | |||
| .cpu_name = "POWER8 (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power9 DD2.0 */ | |||
| .pvr_mask = 0xffffefff, | |||
| .pvr_value = 0x004e0200, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power9 DD 2.1 */ | |||
| .pvr_mask = 0xffffefff, | |||
| .pvr_value = 0x004e0201, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power9 DD2.2 or later */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004e0000, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power10 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x00800000, | |||
| .cpu_name = "POWER10 (raw)", | |||
| .cpu_type = CPU_POWER10, | |||
| }, | |||
| { /* End of table, pvr_mask and pvr_value must be zero */ | |||
| .pvr_mask = 0x0, | |||
| .pvr_value = 0x0, | |||
| .cpu_name = "Unknown", | |||
| .cpu_type = CPU_UNKNOWN, | |||
| }, | |||
| }; | |||
| /* | |||
| * Stand-in for the compiler's __builtin_cpu_is(). | |||
| * | |||
| * Reads the processor version register with mfpvr, classifies it through the | |||
| * pvrPOWER table and compares the result with the requested name. | |||
| * | |||
| * cpu: CPU name to test; only "power8" and "power9" are recognised. | |||
| * Returns non-zero when the running CPU is of the requested type, 0 otherwise | |||
| * (including for every other name). | |||
| */ | |||
| static int __builtin_cpu_is(const char *cpu) { | |||
| size_t i; | |||
| uint32_t pvr; | |||
| uint32_t cpu_type; | |||
| asm("mfpvr %0" : "=r"(pvr)); | |||
| /* First matching row wins. The all-zero terminator row matches any PVR, | |||
| so the loop always breaks with i on a valid index. */ | |||
| for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { | |||
| if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { | |||
| break; | |||
| } | |||
| } | |||
| #if defined(DEBUG) | |||
| /* cpu_type is an integer code, not a pointer: print it with %u. */ | |||
| printf("%s: returning CPU=%s, cpu_type=%u\n", __func__, | |||
| pvrPOWER[i].cpu_name, (unsigned int) pvrPOWER[i].cpu_type); | |||
| #endif | |||
| cpu_type = pvrPOWER[i].cpu_type; | |||
| if (!strcmp(cpu, "power8")) | |||
| return cpu_type == CPU_POWER8; | |||
| if (!strcmp(cpu, "power9")) | |||
| return cpu_type == CPU_POWER9; | |||
| return 0; | |||
| } | |||
| #endif /* C_PGI || __clang__ */ | |||
| static gotoblas_t *get_coretype(void) { | |||
| #ifndef C_PGI | |||
| if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||
| return &gotoblas_POWER6; | |||
| #endif | |||
| if (__builtin_cpu_is("power8")) | |||
| return &gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| @@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_POWER10; | |||
| #endif | |||
| /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| if (__builtin_cpu_is("power10")) | |||
| return &gotoblas_POWER9; | |||
| #endif | |||
| @@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| switch (found) | |||
| { | |||
| #ifndef C_PGI | |||
| case 1: return (&gotoblas_POWER6); | |||
| #endif | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| @@ -222,11 +222,11 @@ int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| #if defined(__GLIBC_PREREQ) | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); | |||
| func = &memoryalloc[0]; | |||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| @@ -1619,10 +1619,12 @@ static int on_process_term(void) | |||
| #else | |||
| #pragma data_seg(".CRT$XLB") | |||
| #endif | |||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #ifdef _WIN64 | |||
| static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #pragma const_seg() | |||
| #else | |||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #pragma data_seg() | |||
| #endif | |||
| @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI | |||
| #else | |||
| #pragma data_seg(".CRT$XTU") | |||
| #endif | |||
| static int(*p_process_term)(void) = on_process_term; | |||
| #ifdef _WIN64 | |||
| static const int(*p_process_term)(void) = on_process_term; | |||
| #pragma const_seg() | |||
| #else | |||
| static int(*p_process_term)(void) = on_process_term; | |||
| #pragma data_seg() | |||
| #endif | |||
| #endif | |||
| @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #ifndef MEM_LARGE_PAGES | |||
| #define MEM_LARGE_PAGES 0x20000000 | |||
| #endif | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| #define ALLOC_MMAP | |||
| #define ALLOC_MALLOC | |||
| #else | |||
| #define ALLOC_MALLOC | |||
| inline int puts(const char *str) { return 0; } | |||
| inline int printf(const char *format, ...) { return 0; } | |||
| inline char *getenv(const char *name) { return ""; } | |||
| inline int atoi(const char *str) { return 0; } | |||
| #endif | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include <fcntl.h> | |||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||
| #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -1,4 +1,4 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| # Changelog | |||
| # 2017/09/03 staticfloat | |||
| @@ -1,4 +1,4 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| @@ -32,9 +32,9 @@ if ($compiler eq "") { | |||
| "xlf95", "xlf90", "xlf", | |||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | |||
| "pathf90", "pathf95", | |||
| "pgf95", "pgf90", "pgf77", | |||
| "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", | |||
| "flang", "egfortran", | |||
| "ifort"); | |||
| "ifort", "nagfor"); | |||
| OUTER: | |||
| foreach $lists (@lists) { | |||
| @@ -64,7 +64,9 @@ if ($compiler eq "") { | |||
| if (!$?) { | |||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; | |||
| if ($data eq "") { | |||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; | |||
| } | |||
| if ($data =~ /zhoge_/) { | |||
| $bu = "_"; | |||
| } | |||
| @@ -76,6 +78,7 @@ if ($compiler eq "") { | |||
| } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { | |||
| $data =~ s/\(+.*?\)+//g; | |||
| $data =~ /(\d+)\.(\d+).(\d+)/; | |||
| $major = $1; | |||
| $minor = $2; | |||
| @@ -87,7 +90,7 @@ if ($compiler eq "") { | |||
| if ($compiler =~ /flang/) { | |||
| $vendor = FLANG; | |||
| $openmp = "-fopenmp"; | |||
| } elsif ($compiler =~ /pgf/) { | |||
| } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } else { | |||
| @@ -123,7 +126,7 @@ if ($compiler eq "") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($data =~ /PGF/) { | |||
| if ($data =~ /PGF/ || $data =~ /NVF/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } | |||
| @@ -133,8 +136,16 @@ if ($compiler eq "") { | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($data =~ /NAG/) { | |||
| $vendor = NAG; | |||
| $openmp = "-openmp"; | |||
| } | |||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | |||
| if ($data eq "") { | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; | |||
| } | |||
| if ($data =~ / zho_ge__/) { | |||
| $need2bu = 1; | |||
| } | |||
| @@ -177,7 +188,7 @@ if ($compiler eq "") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler =~ /pgf/) { | |||
| if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $bu = "_"; | |||
| $openmp = "-mp"; | |||
| @@ -222,6 +233,12 @@ if ($compiler eq "") { | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| if ($compiler =~ /nagfor/) { | |||
| $vendor = NAG; | |||
| $bu = "_"; | |||
| $openmp = "-openmp"; | |||
| } | |||
| if ($vendor eq "") { | |||
| $nofortran = 1; | |||
| $compiler = "gfortran"; | |||
| @@ -275,14 +292,20 @@ if (!$?) { | |||
| if ($?) { | |||
| $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| #For nagfor | |||
| if ($?) { | |||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $binary = "" if ($?); | |||
| } | |||
| if ($binary eq "") { | |||
| $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| } | |||
| if ( $vendor eq "NAG") { | |||
| $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; | |||
| } | |||
| $linker_L = ""; | |||
| $linker_l = ""; | |||
| $linker_a = ""; | |||
| @@ -330,12 +353,13 @@ if ($link ne "") { | |||
| $flags =~ s/\@/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /-lgomp/ && $CC =~ /clang/) { | |||
| if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { | |||
| $flags = "-lomp"; | |||
| } | |||
| if ( | |||
| ($flags =~ /^\-l/) | |||
| && ($flags !~ /ibrary/) | |||
| && ($flags !~ /gfortranbegin/) | |||
| && ($flags !~ /frtbegin/) | |||
| && ($flags !~ /pathfstart/) | |||
| @@ -352,6 +376,16 @@ if ($link ne "") { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| # Keep the quickfit.o / safefit.o / thsafe.o objects on the link line, but only | |||
| # for the NAG compiler. The vendor must be compared as a string: a numeric == | |||
| # between two non-numeric strings is always true in Perl, which would make | |||
| # these checks apply to every vendor. | |||
| if ( $flags =~ /quickfit.o/ && $vendor eq "NAG") { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| if ( $flags =~ /safefit.o/ && $vendor eq "NAG") { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| if ( $flags =~ /thsafe.o/ && $vendor eq "NAG") { | |||
| $linker_l .= $flags . " "; | |||
| } | |||
| $linker_a .= $flags . " " if $flags =~ /\.a$/; | |||
| } | |||
| @@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef __riscv | |||
| #include "cpuid_riscv64.c" | |||
| #define OPENBLAS_SUPPORTED | |||
| #endif | |||
| #ifdef __arm__ | |||
| @@ -4,7 +4,7 @@ | |||
| #else | |||
| #include "config_kernel.h" | |||
| #endif | |||
| #include "param.h" | |||
| #include "common.h" | |||
| int main(int argc, char **argv) { | |||
| @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | |||
| @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||
| CZBLAS2OBJS = \ | |||
| @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c | |||
| cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c | |||
| cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | |||
| cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1,4 +1,4 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| $count = 0; | |||
| @@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| #ifdef SMP | |||
| double MNK; | |||
| #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| @@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) | |||
| int nodes; | |||
| @@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #ifdef SMP | |||
| #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) | |||
| mode |= (transa << BLAS_TRANSA_SHIFT); | |||
| mode |= (transb << BLAS_TRANSB_SHIFT); | |||
| #endif | |||
| MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| @@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| dq1 = dp1 * *dx1; | |||
| if(ABS(dq1) > ABS(dq2)) | |||
| { | |||
| dflag = ZERO; | |||
| dh11 = ONE; | |||
| dh22 = ONE; | |||
| dh21 = - dy1 / *dx1; | |||
| @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| endif () | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) | |||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| @@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| override CFLAGS += -march=cooperlake | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| @@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| endif | |||
| endif | |||
| else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| @@ -1,3 +1,11 @@ | |||
| # FMAFLAG: extra compiler option passed to the rules that reference it | |||
| # (the srot_k and drot_k kernel rules in this file). | |||
| # Empty by default; set to -mfma only when HAVE_FMA3 is defined and OLDGCC is not. | |||
| FMAFLAG= | |||
| ifndef OLDGCC | |||
| ifdef HAVE_FMA3 | |||
| FMAFLAG = -mfma | |||
| endif | |||
| endif | |||
| ### AMAX ### | |||
| ifndef SAMAXKERNEL | |||
| @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
| $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ | |||
| $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| @@ -1,5 +1,5 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| /***************************************************** | |||
| * 2014/06/09 Saar | |||
| * | |||
| * Order rowMajor | |||
| * Trans | |||
| * | |||
| ******************************************************/ | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | |||
| { | |||
| BLASLONG i,j; | |||
| FLOAT *aptr,*bptr; | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; | |||
| if ( rows <= 0 ) return(0); | |||
| if ( cols <= 0 ) return(0); | |||
| if (rows <= 0) return 0; | |||
| if (cols <= 0) return 0; | |||
| aptr = a; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for ( i=0; i<rows ; i++ ) | |||
| { | |||
| bptr = &b[i]; | |||
| for(j=0; j<cols; j++) | |||
| { | |||
| bptr[j*ldb] = alpha * aptr[j]; | |||
| } | |||
| aptr += lda; | |||
| } | |||
| i = (rows >> 2); | |||
| if (i > 0) { | |||
| do { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| return(0); | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| b_offset += 4; | |||
| j = (cols >> 2); | |||
| if (j > 0) { | |||
| do { | |||
| /* Column 1 of MAT_B */ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| /* Column 2 of MAT_B */ | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||
| /* Column 3 of MAT_B */ | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A | |||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||
| *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; | |||
| *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; | |||
| /* Column 4 of MAT_B */ | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A | |||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||
| *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; | |||
| *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } // if(j > 0) | |||
| if (cols & 2) { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| a_offset3 += 2; | |||
| a_offset4 += 2; | |||
| b_offset1 += ldb*2; | |||
| } | |||
| if (cols & 1) { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||
| } | |||
| i--; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| if (rows & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| b_offset += 2; | |||
| j = (cols >> 2); | |||
| if (j > 0){ | |||
| do { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } | |||
| if (cols & 2){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| b_offset1 += ldb*2; | |||
| } | |||
| if (cols & 1){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| } | |||
| } // if (rows & 2) | |||
| if (rows & 1) { | |||
| a_offset1 = a_offset; | |||
| a_offset += lda; | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| j = (cols >> 2); | |||
| if (j > 0){ | |||
| do { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } | |||
| if (cols & 2){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| b_offset1 += ldb * 2; | |||
| } | |||
| if (cols & 1){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| dot[0]=0.0; | |||
| dot[1]=0.0; | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||
| CREAL(result) = 0.0 ; | |||
| CIMAG(result) = 0.0 ; | |||
| #else | |||
| @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| i++ ; | |||
| } | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| #else | |||
| @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| SDOTKERNEL = ../generic/dot.c | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot_thunderx.c | |||
| DDOTKERNEL = ddot_thunderx.c | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n | |||
| #define CUR_MAXINV "d8" | |||
| #define CUR_MAXINV_V "v8.2d" | |||
| #define CUR_MAX_V "v8.2d" | |||
| #define REGINF "d9" | |||
| static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| double *ssq, double *scale) | |||
| @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| "1: //nrm2_kernel_F_BEGIN: \n" | |||
| " mov x6, #0x7FF0000000000000 //+Infinity \n" | |||
| " fmov "REGZERO", xzr \n" | |||
| " fmov "REGONE", #1.0 \n" | |||
| " fmov "REGINF", x6 \n" | |||
| " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | |||
| " mov "J", "N" \n" | |||
| " cmp "J", xzr \n" | |||
| @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " ldr d4, ["X"] \n" | |||
| " fabs d4, d4 \n" | |||
| " fmax "CUR_MAX", "SCALE", d4 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| " fmul "SCALE", "SCALE", "SCALE" \n" | |||
| " fmul "SSQ", "SSQ", "SCALE" \n" | |||
| @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " ldr d3, ["X", #8] \n" | |||
| " fabs d3, d3 \n" | |||
| " fmax "CUR_MAX", "SCALE", d3 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| " fmul "SCALE", "SCALE", "SCALE" \n" | |||
| " fmul "SSQ", "SSQ", "SCALE" \n" | |||
| @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " fmaxp v24.2d, v24.2d, v26.2d \n" | |||
| " fmaxp v24.2d, v24.2d, v24.2d \n" | |||
| " fmax "CUR_MAX", "SCALE", d24 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | |||
| " //dup "CUR_MAX_V", v7.d[0] \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " fmaxp v24.2d, v24.2d, v26.2d \n" | |||
| " fmaxp v24.2d, v24.2d, v24.2d \n" | |||
| " fmax "CUR_MAX", "SCALE", d24 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" | |||
| " //dup "CUR_MAX_V", v7.d[0] \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " ldr d4, ["X"] \n" | |||
| " fabs d4, d4 \n" | |||
| " fmax "CUR_MAX", "SCALE", d4 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| " fmul "SCALE", "SCALE", "SCALE" \n" | |||
| " fmul "SSQ", "SSQ", "SCALE" \n" | |||
| @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " ldr d3, ["X", #8] \n" | |||
| " fabs d3, d3 \n" | |||
| " fmax "CUR_MAX", "SCALE", d3 \n" | |||
| " fcmp "CUR_MAX", "REGINF" \n" | |||
| " beq 10f \n" | |||
| " fdiv "SCALE", "SCALE", "CUR_MAX" \n" | |||
| " fmul "SCALE", "SCALE", "SCALE" \n" | |||
| " fmul "SSQ", "SSQ", "SCALE" \n" | |||
| @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| "9: //nrm2_kernel_L999: \n" | |||
| " str "SSQ", [%[SSQ_]] \n" | |||
| " str "SCALE", [%[SCALE_]] \n" | |||
| " b 11f \n" | |||
| "10: \n" | |||
| " str "REGINF", [%[SSQ_]] \n" | |||
| " str "REGINF", [%[SCALE_]] \n" | |||
| "11: \n" | |||
| : | |||
| : [SSQ_] "r" (ssq), //%0 | |||
| @@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| [INCX_] "r" (inc_x) //%4 | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", "x6", | |||
/*
 * Clobber list. REGINF is defined as "d9" and is written by the asm body
 * (fmov REGINF, x6), so d9 must be declared here: the low 64 bits of v8-v15
 * are callee-saved under AAPCS64, and without the clobber the compiler will
 * neither save/restore d9 nor assume it was modified.
 * NOTE(review): the body also writes v24 (fmaxp/fmax) which is not listed;
 * confirm whether the remaining vector registers need declaring as well.
 */
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9" | |||
| ); | |||
| @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| cur_ssq = *ptr; | |||
| cur_scale = *(ptr + 1); | |||
| if (cur_ssq == INFINITY) { | |||
| ssq = INFINITY; | |||
| scale = INFINITY; | |||
| break; | |||
| } | |||
| if (cur_scale != 0) { | |||
| if (cur_scale > scale) { | |||
| scale = (scale / cur_scale); | |||
| @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c | |||
| SDOTKERNEL = sdot_power10.c | |||
| DDOTKERNEL = ddot_power10.c | |||
| DSDOTKERNEL = sdot_power10.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| CDOTKERNEL = cdot.c | |||
| endif | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| @@ -0,0 +1,115 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/* Advertises that this file supplies copy_kernel; the including source
 * checks this macro with #ifndef HAVE_KERNEL. */
| #define HAVE_KERNEL 1 | |||
/*
 * copy_kernel: copy a contiguous block from x to y with POWER10 paired
 * vector loads (lxvp) and 16-byte vector stores (stxv).
 *
 *   n : element count; decremented by 32 per 256-byte block, so it must be a
 *       positive multiple of 32 (one block is always copied, even for n <= 32).
 *       256 bytes / 32 elements implies 8-byte elements -- presumably
 *       single-precision complex; TODO confirm against the includer's FLOAT.
 *   x : source pointer, advanced by the asm (local copy only).
 *   y : destination pointer, advanced by the asm (local copy only).
 *
 * The loop is software-pipelined: the first 256 bytes are loaded before the
 * loop, every iteration stores the previously loaded block while loading the
 * next one, and the code at label "two" stores the final block.
 */
| static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
/* Prologue: preload the first 256 bytes of x into vs32..vs47. */
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "addi %2, %2, 256 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Each register pair is stored odd register first (e.g. 33 then 32);
 * presumably this restores memory order for lxvp pairs on little-endian
 * -- confirm. Registers are reloaded only after they have been stored. */
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "addi %3, %3, 256 \n\t" | |||
| "addi %2, %2, 256 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Epilogue: flush the last loaded 256-byte block to y. */
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| "stxv 42, 176(%3) \n\t" | |||
| "stxv 45, 192(%3) \n\t" | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| "#n=%1 x=%4=%2 y=%0=%3" | |||
| : | |||
| "=m" (*y), | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" | |||
| ); | |||
| } | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "copy_microk_power10.c" | |||
| #include "ccopy_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL | |||
| @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -64; | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| copy_kernel(n1, x, y); | |||
| @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #include "common.h" | |||
| #if defined(POWER10) | |||
| #include "cdot_microk_power10.c" | |||
| #else | |||
| #ifndef HAVE_KERNEL_8 | |||
| #include <altivec.h> | |||
| @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||
| } | |||
| #endif | |||
| #endif | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||
| @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| #if defined(POWER10) | |||
| BLASLONG n1 = n & -16; | |||
| #else | |||
| BLASLONG n1 = n & -8; | |||
| #endif | |||
| BLASLONG j=0; | |||
| if (n1){ | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/* Advertises that this file supplies cdot_kernel_8; the including source
 * checks this macro with #ifndef HAVE_KERNEL_8. */
| #define HAVE_KERNEL_8 1 | |||
/*
 * cdot_kernel_8: POWER10 inner kernel for a single-precision complex dot
 * product over contiguous x and y.
 *
 *   n   : decremented by 16 per 128-byte block of each input, so it must be
 *         a positive multiple of 16 (one block is always processed).
 *   x,y : input pointers, advanced by the asm (local copies only).
 *   dot : receives 16 bytes (four floats) through the final stxv; the caller
 *         combines them into the complex result. NOTE(review): the "=m"(*dot)
 *         operand names only the first float although four are written.
 *
 * Two families of accumulators are kept:
 *   vs32/34/36/38 : x * y element-wise            (x_r*y_r , x_i*y_i)
 *   vs33/35/37/39 : x * permuted(y)               (x_r*y_i , x_i*y_r)
 * where permuted(y) is y passed through xxperm with `mask`; per the inline
 * comments this swaps the real and imaginary words of each complex element
 * (the byte indices are endian-dependent -- confirm for big-endian builds).
 * `mask` is an input in a "wa" register; because vs32..vs63 are all
 * clobbered it has to be allocated outside that range.
 */
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| { | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| "dcbt 0, %3 \n\t" | |||
/* Zero the eight accumulators. */
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
/* Prologue: preload the first 128 bytes of x (vs40..47) and y (vs48..55),
 * and build the permuted copies of y in vs56..63. */
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "xxperm 56, 48, %x7 \n\t" | |||
| "xxperm 57, 49, %x7 \n\t" | |||
| "xxperm 58, 50, %x7 \n\t" | |||
| "xxperm 59, 51, %x7 \n\t" | |||
| "xxperm 60, 52, %x7 \n\t" | |||
| "xxperm 61, 53, %x7 \n\t" | |||
| "xxperm 62, 54, %x7 \n\t" | |||
| "xxperm 63, 55, %x7 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Main loop: consume the block loaded previously while loading the next
 * one; each register is reloaded only after its last use in this pass. */
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "xxperm 56, 48, %x7 \n\t" | |||
| "xxperm 57, 49, %x7 \n\t" | |||
| "xxperm 58, 50, %x7 \n\t" | |||
| "xxperm 59, 51, %x7 \n\t" | |||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "xxperm 60, 52, %x7 \n\t" | |||
| "xxperm 61, 53, %x7 \n\t" | |||
| "xxperm 62, 54, %x7 \n\t" | |||
| "xxperm 63, 55, %x7 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Epilogue: accumulate the last loaded block (no further loads). */
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
/* Reduction: sum the four accumulators of each family, fold the two
 * doublewords of each sum together, then merge one doubleword from each
 * family into vs34 and store the resulting 16 bytes to dot. */
| "xvaddsp 32, 32, 34 \n\t" | |||
| "xvaddsp 36, 36, 38 \n\t" | |||
| "xvaddsp 33, 33, 35 \n\t" | |||
| "xvaddsp 37, 37, 39 \n\t" | |||
| "xvaddsp 35, 32, 36 \n\t" | |||
| "xvaddsp 34, 33, 37 \n\t" | |||
| "xxswapd 32, 35 \n\t" | |||
| "xxswapd 33, 34 \n\t" | |||
| "xvaddsp 35, 35, 32 \n\t" | |||
| "xvaddsp 34, 34, 33 \n\t" | |||
| "xxpermdi 34, 34, 35, 2 \n\t" | |||
| "stxv 34, 0(%6) \n\t" | |||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||
| : | |||
| "=m" (*dot), | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x), | |||
| "m" (*y), | |||
| "b" (dot), // 6 | |||
| "wa" (mask) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||
| ); | |||
| } | |||
| @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "one%=: \n\t" | |||
| "stxvp 32, 0(%3) \n\t" | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "stxvp 48, 256(%3) \n\t" | |||
| "lxvp 48, 256(%2) \n\t" | |||
| "stxvp 50, 288(%3) \n\t" | |||
| "lxvp 50, 288(%2) \n\t" | |||
| "stxvp 52, 320(%3) \n\t" | |||
| "lxvp 52, 320(%2) \n\t" | |||
| "stxvp 54, 352(%3) \n\t" | |||
| "lxvp 48, 256(%2) \n\t" | |||
| "lxvp 50, 288(%2) \n\t" | |||
| "lxvp 52, 320(%2) \n\t" | |||
| "lxvp 54, 352(%2) \n\t" | |||
| "stxvp 56, 384(%3) \n\t" | |||
| "lxvp 56, 384(%2) \n\t" | |||
| "stxvp 58, 416(%3) \n\t" | |||
| "lxvp 58, 416(%2) \n\t" | |||
| "stxvp 60, 448(%3) \n\t" | |||
| "lxvp 60, 448(%2) \n\t" | |||
| "stxvp 62, 480(%3) \n\t" | |||
| "lxvp 56, 384(%2) \n\t" | |||
| "lxvp 58, 416(%2) \n\t" | |||
| "lxvp 60, 448(%2) \n\t" | |||
| "lxvp 62, 480(%2) \n\t" | |||
| "addi %3, %3, 512 \n\t" | |||
| @@ -0,0 +1,176 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/* Advertises that this file supplies an optimized scal kernel.
 * NOTE(review): presumably tested by the including source -- not visible here. */
| #define HAVE_KERNEL_8 1 | |||
/*
 * zscal_kernel_8: scale a contiguous single-precision complex vector in
 * place by alpha = alpha_r + i*alpha_i on POWER10. Despite the "z" prefix
 * the operands here are float.
 *
 *   n       : decremented by 16 per 128-byte block, so it must be a positive
 *             multiple of 16 (one block is always processed).
 *   x       : data pointer; read and written in place, advanced by the asm
 *             (local copy only).
 *   alpha_r : splatted across vs32 (xscvdpspn + xxspltw).
 *   alpha_i : supplied through t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}.
 *
 * For every 16-byte vector v of x the kernel computes
 *     v * alpha_r  +  permute(v, mask) * t0
 * where, per the inline comments, the permute swaps the real and imaginary
 * words of each complex element, giving
 *     (x_r*alpha_r - x_i*alpha_i , x_i*alpha_r + x_r*alpha_i).
 * (The mask byte indices are endian-dependent -- confirm for big-endian.)
 */
| static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
| { | |||
| __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| "xscvdpspn 32, %x3 \n\t" | |||
| "xxspltw 32, 32, 0 \n\t" | |||
/* Prologue: preload the first 128 bytes of x into vs40..vs47. */
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmulsp 49, 41, 32 \n\t" | |||
| "xvmulsp 50, 42, 32 \n\t" | |||
| "xvmulsp 51, 43, 32 \n\t" | |||
| "xvmulsp 52, 44, 32 \n\t" | |||
| "xvmulsp 53, 45, 32 \n\t" | |||
| "xvmulsp 54, 46, 32 \n\t" | |||
| "xvmulsp 55, 47, 32 \n\t" | |||
| "xxperm 34, 40, %x5 \n\t" | |||
| "xxperm 35, 41, %x5 \n\t" | |||
| "xxperm 36, 42, %x5 \n\t" | |||
| "xxperm 37, 43, %x5 \n\t" | |||
| "xxperm 38, 44, %x5 \n\t" | |||
| "xxperm 39, 45, %x5 \n\t" | |||
| "xxperm 56, 46, %x5 \n\t" | |||
| "xxperm 57, 47, %x5 \n\t" | |||
/* The next block (offsets 128..255) is loaded while the current one is
 * still being finished; x is advanced only after the stores below. */
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmulsp 35, 35, %x4 \n\t" | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "xvmulsp 36, 36, %x4 \n\t" | |||
| "xvmulsp 37, 37, %x4 \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "xvmulsp 38, 38, %x4 \n\t" | |||
| "xvmulsp 39, 39, %x4 \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "xvmulsp 56, 56, %x4 \n\t" | |||
| "xvmulsp 57, 57, %x4 \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "xvaddsp 48, 48, 34 \n\t" | |||
| "xvaddsp 49, 49, 35 \n\t" | |||
| "xvaddsp 50, 50, 36 \n\t" | |||
| "xvaddsp 51, 51, 37 \n\t" | |||
| "stxvp 48, 0(%2) \n\t" | |||
| "xvaddsp 52, 52, 38 \n\t" | |||
| "xvaddsp 53, 53, 39 \n\t" | |||
| "stxvp 50, 32(%2) \n\t" | |||
| "xvaddsp 54, 54, 56 \n\t" | |||
| "xvaddsp 55, 55, 57 \n\t" | |||
| "stxvp 52, 64(%2) \n\t" | |||
| "stxvp 54, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Epilogue: same computation for the last loaded block, without loads. */
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmulsp 49, 41, 32 \n\t" | |||
| "xvmulsp 50, 42, 32 \n\t" | |||
| "xvmulsp 51, 43, 32 \n\t" | |||
| "xvmulsp 52, 44, 32 \n\t" | |||
| "xvmulsp 53, 45, 32 \n\t" | |||
| "xvmulsp 54, 46, 32 \n\t" | |||
| "xvmulsp 55, 47, 32 \n\t" | |||
| "xxperm 34, 40, %x5 \n\t" | |||
| "xxperm 35, 41, %x5 \n\t" | |||
| "xxperm 36, 42, %x5 \n\t" | |||
| "xxperm 37, 43, %x5 \n\t" | |||
| "xxperm 38, 44, %x5 \n\t" | |||
| "xxperm 39, 45, %x5 \n\t" | |||
| "xxperm 56, 46, %x5 \n\t" | |||
| "xxperm 57, 47, %x5 \n\t" | |||
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmulsp 35, 35, %x4 \n\t" | |||
| "xvmulsp 36, 36, %x4 \n\t" | |||
| "xvmulsp 37, 37, %x4 \n\t" | |||
| "xvmulsp 38, 38, %x4 \n\t" | |||
| "xvmulsp 39, 39, %x4 \n\t" | |||
| "xvmulsp 56, 56, %x4 \n\t" | |||
| "xvmulsp 57, 57, %x4 \n\t" | |||
| "xvaddsp 48, 48, 34 \n\t" | |||
| "xvaddsp 49, 49, 35 \n\t" | |||
| "xvaddsp 50, 50, 36 \n\t" | |||
| "xvaddsp 51, 51, 37 \n\t" | |||
| "stxvp 48, 0(%2) \n\t" | |||
| "xvaddsp 52, 52, 38 \n\t" | |||
| "xvaddsp 53, 53, 39 \n\t" | |||
| "stxvp 50, 32(%2) \n\t" | |||
| "xvaddsp 54, 54, 56 \n\t" | |||
| "xvaddsp 55, 55, 57 \n\t" | |||
| "stxvp 52, 64(%2) \n\t" | |||
| "stxvp 54, 96(%2) \n\t" | |||
| "#n=%1 x=%0=%2 alpha=(%3,%4)\n" | |||
| : | |||
| "+m" (*x), | |||
| "+r" (n), // 1 | |||
| "+b" (x) // 2 | |||
| : | |||
| "f" (alpha_r), // 3 | |||
| "wa" (t0), // 4 | |||
| "wa" (mask) // 5 | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57" | |||
| ); | |||
| } | |||
| @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "cswap_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "cswap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -0,0 +1,127 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/*
 * POWER10 swap micro-kernel, compiled as zswap_kernel_16 (DOUBLE defined,
 * HAVE_KERNEL_16 set) or cswap_kernel_32 (otherwise, HAVE_KERNEL_32 set).
 *
 * Exchanges the contents of x and y, 256 bytes of each per iteration:
 *   1. load 256 bytes of y into vs32..vs47 and 256 bytes of x into vs48..vs63;
 *   2. store the y data to x, then the x data to y, advancing each pointer
 *      in two 128-byte steps.
 * Each register pair is stored odd register first (e.g. 33 then 32);
 * presumably this restores memory order for lxvp pairs on little-endian
 * -- confirm.
 *
 *   n   : decremented by 16 (DOUBLE) or 32 per iteration. The loop tests n
 *         only after the first pass, so n must be a positive multiple of
 *         that step; n <= 0 would still swap one block.
 *   x,y : buffers to exchange; pointers are advanced by the asm (local
 *         copies only).
 */
| #if defined(DOUBLE) | |||
| #define HAVE_KERNEL_16 1 | |||
| static void zswap_kernel_16 (long n, double *x, double *y) | |||
| #else | |||
| #define HAVE_KERNEL_32 1 | |||
| static void cswap_kernel_32 (long n, float *x, float *y) | |||
| #endif | |||
| { | |||
| __asm__ | |||
| ( | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Load the current 256-byte block of y (%4) ... */
| "lxvp 32, 0(%4) \n\t" | |||
| "lxvp 34, 32(%4) \n\t" | |||
| "lxvp 36, 64(%4) \n\t" | |||
| "lxvp 38, 96(%4) \n\t" | |||
| "lxvp 40, 128(%4) \n\t" | |||
| "lxvp 42, 160(%4) \n\t" | |||
| "lxvp 44, 192(%4) \n\t" | |||
| "lxvp 46, 224(%4) \n\t" | |||
/* ... and of x (%3), before anything is overwritten. */
| "lxvp 48, 0(%3) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "lxvp 56, 128(%3) \n\t" | |||
| "lxvp 58, 160(%3) \n\t" | |||
| "lxvp 60, 192(%3) \n\t" | |||
| "lxvp 62, 224(%3) \n\t" | |||
/* Write the former y data into x. */
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| "stxv 34, 48(%3) \n\t" | |||
| "stxv 37, 64(%3) \n\t" | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxv 41, 0(%3) \n\t" | |||
| "stxv 40, 16(%3) \n\t" | |||
| "stxv 43, 32(%3) \n\t" | |||
| "stxv 42, 48(%3) \n\t" | |||
| "stxv 45, 64(%3) \n\t" | |||
| "stxv 44, 80(%3) \n\t" | |||
| "stxv 47, 96(%3) \n\t" | |||
| "stxv 46, 112(%3) \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
/* Write the former x data into y. */
| "stxv 49, 0(%4) \n\t" | |||
| "stxv 48, 16(%4) \n\t" | |||
| "stxv 51, 32(%4) \n\t" | |||
| "stxv 50, 48(%4) \n\t" | |||
| "stxv 53, 64(%4) \n\t" | |||
| "stxv 52, 80(%4) \n\t" | |||
| "stxv 55, 96(%4) \n\t" | |||
| "stxv 54, 112(%4) \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "stxv 57, 0(%4) \n\t" | |||
| "stxv 56, 16(%4) \n\t" | |||
| "stxv 59, 32(%4) \n\t" | |||
| "stxv 58, 48(%4) \n\t" | |||
| "stxv 61, 64(%4) \n\t" | |||
| "stxv 60, 80(%4) \n\t" | |||
| "stxv 63, 96(%4) \n\t" | |||
| "stxv 62, 112(%4) \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| #if defined(DOUBLE) | |||
| "addic. %2, %2, -16 \n\t" | |||
| #else | |||
| "addic. %2, %2, -32 \n\t" | |||
| #endif | |||
| "bgt one%= \n" | |||
| "#n=%2 x=%0=%3 y=%1=%4" | |||
| : | |||
| "+m" (*x), | |||
| "+m" (*y), | |||
| "+r" (n), // 2 | |||
| "+b" (x), // 3 | |||
| "+b" (y) // 4 | |||
| : | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||
| ); | |||
| } | |||
| @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dasum_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "dasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| sumf += ABS(x[i]); | |||
| } | |||
| } | |||
| n1 = (n-i) & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sumf += dasum_kernel_16(n1, &x[i]); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sumf = dasum_kernel_16(n1, x); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,152 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/*
 * dasum_kernel_16: returns the sum of the absolute values of the doubles
 * starting at x.
 *
 *   n - number of doubles. The data is consumed in chunks of 16 doubles
 *       (128 bytes); the first chunk is loaded before n is tested, so at
 *       least 16 doubles must be readable. A count that is not a multiple of
 *       16 is effectively rounded up by the loop test.
 *       NOTE(review): callers are presumably required to pass a positive
 *       multiple of 16 -- confirm at the call sites.
 *   x - input data; only read.
 *
 * The loop is software-pipelined: while the current chunk is being reduced,
 * the next chunk is already being loaded. Eight vector accumulators
 * (vs32..vs39) are kept and combined at the end.
 */
| #define HAVE_KERNEL_16 1 | |||
| static double dasum_kernel_16 (long n, double *x) | |||
| { | |||
| double sum; | |||
/* t0..t3 are scratch vectors; the compiler picks their VSX registers. */
| __vector double t0; | |||
| __vector double t1; | |||
| __vector double t2; | |||
| __vector double t3; | |||
| __asm__ | |||
| ( | |||
/* Cache-touch hint for the start of x. */
| "dcbt 0, %2 \n\t" | |||
/* Zero the eight accumulators vs32..vs39. */
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
/* Prologue: load the first 16 doubles into vs40..vs47 and advance x. */
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
/* If that was the only chunk, skip the loop and go straight to the drain. */
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/*
 * Steady state: take absolute values of the chunk already in vs40..vs47
 * (results in vs48..vs51 and t0..t3), interleaved with the loads of the next
 * chunk, then add the absolute values into the accumulators.
 */
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "xvabsdp %x3, 44 \n\t" | |||
| "xvabsdp %x4, 45 \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "xvabsdp %x5, 46 \n\t" | |||
| "xvabsdp %x6, 47 \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "xvadddp 36, 36, %x3 \n\t" | |||
| "xvadddp 37, 37, %x4 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "xvadddp 38, 38, %x5 \n\t" | |||
| "xvadddp 39, 39, %x6 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Drain: reduce the last chunk that was loaded; no further loads happen. */
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "xvabsdp %x3, 44 \n\t" | |||
| "xvabsdp %x4, 45 \n\t" | |||
| "xvabsdp %x5, 46 \n\t" | |||
| "xvabsdp %x6, 47 \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "xvadddp 36, 36, %x3 \n\t" | |||
| "xvadddp 37, 37, %x4 \n\t" | |||
| "xvadddp 38, 38, %x5 \n\t" | |||
| "xvadddp 39, 39, %x6 \n\t" | |||
/* Pairwise reduction of the eight accumulators down to vs32. */
| "xvadddp 32, 32, 33 \n\t" | |||
| "xvadddp 34, 34, 35 \n\t" | |||
| "xvadddp 36, 36, 37 \n\t" | |||
| "xvadddp 38, 38, 39 \n\t" | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
/*
 * Final horizontal add of the two lanes of vs32 into sum.
 * XXSWAPD_S is a macro defined outside this view; presumably it emits a
 * doubleword swap of vs32 into vs33 -- confirm against its definition.
 */
| XXSWAPD_S(33,32) | |||
| "xsadddp %x0, 32, 33 \n" | |||
/*
 * NOTE(review): the annotation below prints %3 next to x, but operand 3 is
 * t0; the memory operand for *x is the input listed after the outputs.
 * This only affects the comment emitted into the assembler output.
 */
| "#n=%1 x=%3=%2 sum=%0\n" | |||
| "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" | |||
| : | |||
| "=d" (sum), // 0 | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "=wa" (t0), // 3 | |||
| "=wa" (t1), // 4 | |||
| "=wa" (t2), // 5 | |||
| "=wa" (t3) // 6 | |||
| : | |||
/*
 * NOTE(review): this input names only x[0] although the loop reads 128 bytes
 * per chunk -- confirm the compiler cannot move stores to the rest of x past
 * this statement (e.g. under inlining or LTO).
 */
| "m" (*x) | |||
| : | |||
/* cr0 is written by addic.; vs32..vs51 are the fixed registers used above. */
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| return sum; | |||
| } | |||
| @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -64; | |||
| if ( n1 > 0 ) | |||
| if ( n >= 64 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| y[i] = x[i] ; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -64; | |||
| if ( n1 ) | |||
| { | |||
| copy_kernel(n1, x, y); | |||
| i=n1; | |||
| copy_kernel(n1, &x[i], &y[i]); | |||
| i += n1; | |||
| } | |||
| while(i < n) | |||
| @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| #if !__has_builtin(__builtin_vsx_assemble_pair) | |||
| #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
| #endif | |||
| #if !__has_builtin(__builtin_vsx_disassemble_pair) | |||
| #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #define SAVE_ACC(ACC, J) \ | |||
| @@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
| @@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| { | |||
| rowA = (vec_t *) & AO[l << 3]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
| @@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
| @@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| { | |||
| rowA = (vec_t *) & AO[l << 2]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
| @@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 1]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| } | |||
| @@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | |||
| @@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| { | |||
| rowA = (vec_t *) & AO[l << 3]; | |||
| rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| @@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 2]; | |||
| rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| } | |||
| @@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 1]; | |||
| rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| @@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| BLASLONG l = 0; | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[0], t[1] = BO[1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| @@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| rb = (vec_t *) & BO[l << 1]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| rowA = (vec_t *) & AO[l << 3]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| @@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1; | |||
| BLASLONG l = 0; | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[0], t[1] = BO[1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| rb = (vec_t *) & BO[l << 1]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| rowA = (vec_t *) & AO[l << 2]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| @@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0; | |||
| BLASLONG l = 0; | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[0], t[1] = BO[1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| rb = (vec_t *) & BO[l << 1]; | |||
| __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); | |||
| rowA = (vec_t *) & AO[l << 1]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| } | |||
| @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "drot_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "drot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| temp = c*x[i] + s*y[i] ; | |||
| y[i] = c*y[i] - s*x[i] ; | |||
| x[i] = temp ; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| drot_kernel_16(n1,&x[i], &y[i], c, s); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| drot_kernel_16(n1, x1, y1, c, s); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/*
 * drot_kernel_16: applies a plane rotation to the vectors x and y in place:
 *
 *     x[i] <- c * x[i] + s * y[i]
 *     y[i] <- c * y[i] - s * x[i]      (using the original x[i])
 *
 *   n    - number of doubles. Despite the _16 suffix the loop counter is
 *          decremented by 8 and 64 bytes of each array are handled per step.
 *          The first step's data is loaded before n is tested, so at least
 *          8 doubles must be accessible in each array; a count that is not a
 *          multiple of 8 is effectively rounded up.
 *          NOTE(review): callers are presumably required to pass a positive
 *          multiple of 16 -- confirm at the call sites.
 *   x, y - vectors to rotate; both are read and written.
 *   c, s - rotation coefficients.
 *
 * The loop is software-pipelined: the loads for the next step (offsets 64 and
 * 96) are issued before the stores of the current step (offsets 0 and 32),
 * and the pointers are advanced only at the end of the iteration.
 */
| #define HAVE_KERNEL_16 1 | |||
| static void drot_kernel_16 (long n, double *x, double *y, double c, double s) | |||
| { | |||
| __asm__ | |||
| ( | |||
/* XXSPLTD_S is a macro defined outside this view; vs36 = c, vs37 = s. */
| XXSPLTD_S(36,%x5,0) // load c to both dwords | |||
| XXSPLTD_S(37,%x6,0) // load s to both dwords | |||
/* Prologue: fetch the first 8 doubles of x (vs32..vs35) and y (vs48..vs51). */
| "lxvp 32, 0(%3) \n\t" // load x | |||
| "lxvp 34, 32(%3) \n\t" | |||
| "lxvp 48, 0(%4) \n\t" // load y | |||
| "lxvp 50, 32(%4) \n\t" | |||
/* If that was the only step, skip the loop and go straight to the drain. */
| "addic. %2, %2, -8 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||
| "xvmuldp 41, 33, 36 \n\t" | |||
| "xvmuldp 42, 34, 36 \n\t" | |||
| "xvmuldp 43, 35, 36 \n\t" | |||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||
| "xvmuldp 45, 33, 37 \n\t" | |||
| "xvmuldp 46, 34, 37 \n\t" | |||
| "xvmuldp 47, 35, 37 \n\t" | |||
/* Both x products are done, so vs32..vs35 can be refilled with the next x. */
| "lxvp 32, 64(%3) \n\t" // load x | |||
| "lxvp 34, 96(%3) \n\t" | |||
| "xvmuldp 52, 48, 36 \n\t" // c * y | |||
| "xvmuldp 53, 49, 36 \n\t" | |||
| "xvmuldp 54, 50, 36 \n\t" | |||
| "xvmuldp 55, 51, 36 \n\t" | |||
| "xvmuldp 38, 48, 37 \n\t" // s * y | |||
| "xvmuldp 39, 49, 37 \n\t" | |||
| "xvmuldp 56, 50, 37 \n\t" | |||
| "xvmuldp 57, 51, 37 \n\t" | |||
/* Likewise vs48..vs51 are free now and receive the next y values. */
| "lxvp 48, 64(%4) \n\t" // load y | |||
| "lxvp 50, 96(%4) \n\t" | |||
| "xvadddp 40, 40, 38 \n\t" // c * x + s * y | |||
| "xvadddp 41, 41, 39 \n\t" // c * x + s * y | |||
| "xvadddp 42, 42, 56 \n\t" // c * x + s * y | |||
| "xvadddp 43, 43, 57 \n\t" // c * x + s * y | |||
| "stxvp 40, 0(%3) \n\t" // store x | |||
| "stxvp 42, 32(%3) \n\t" | |||
| "xvsubdp 52, 52, 44 \n\t" // c * y - s * x | |||
| "xvsubdp 53, 53, 45 \n\t" // c * y - s * x | |||
| "xvsubdp 54, 54, 46 \n\t" // c * y - s * x | |||
| "xvsubdp 55, 55, 47 \n\t" // c * y - s * x | |||
| "stxvp 52, 0(%4) \n\t" // store y | |||
| "stxvp 54, 32(%4) \n\t" | |||
/* Advance both pointers by the 64 bytes just stored. */
| "addi %3, %3, 64 \n\t" | |||
| "addi %4, %4, 64 \n\t" | |||
| "addic. %2, %2, -8 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Drain: rotate the last 8 doubles that were loaded; no further loads. */
| "xvmuldp 40, 32, 36 \n\t" // c * x | |||
| "xvmuldp 41, 33, 36 \n\t" | |||
| "xvmuldp 42, 34, 36 \n\t" | |||
| "xvmuldp 43, 35, 36 \n\t" | |||
| "xvmuldp 52, 48, 36 \n\t" // c * y | |||
| "xvmuldp 53, 49, 36 \n\t" | |||
| "xvmuldp 54, 50, 36 \n\t" | |||
| "xvmuldp 55, 51, 36 \n\t" | |||
| "xvmuldp 44, 32, 37 \n\t" // s * x | |||
| "xvmuldp 45, 33, 37 \n\t" | |||
| "xvmuldp 46, 34, 37 \n\t" | |||
| "xvmuldp 47, 35, 37 \n\t" | |||
| "xvmuldp 38, 48, 37 \n\t" // s * y | |||
| "xvmuldp 39, 49, 37 \n\t" | |||
| "xvmuldp 56, 50, 37 \n\t" | |||
| "xvmuldp 57, 51, 37 \n\t" | |||
| "xvadddp 40, 40, 38 \n\t" // c * x + s * y | |||
| "xvadddp 41, 41, 39 \n\t" // c * x + s * y | |||
| "xvadddp 42, 42, 56 \n\t" // c * x + s * y | |||
| "xvadddp 43, 43, 57 \n\t" // c * x + s * y | |||
| "stxvp 40, 0(%3) \n\t" // store x | |||
| "stxvp 42, 32(%3) \n\t" | |||
| "xvsubdp 52, 52, 44 \n\t" // c * y - s * x | |||
| "xvsubdp 53, 53, 45 \n\t" // c * y - s * x | |||
| "xvsubdp 54, 54, 46 \n\t" // c * y - s * x | |||
| "xvsubdp 55, 55, 47 \n\t" // c * y - s * x | |||
| "stxvp 52, 0(%4) \n\t" // store y | |||
| "stxvp 54, 32(%4) \n\t" | |||
| "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" | |||
| : | |||
/*
 * NOTE(review): the "+m" operands name only x[0] and y[0] although every
 * step touches 64 bytes of each array -- confirm the compiler cannot reorder
 * other accesses to x/y around this statement.
 */
| "+m" (*x), | |||
| "+m" (*y), | |||
| "+r" (n), // 2 | |||
| "+b" (x), // 3 | |||
| "+b" (y) // 4 | |||
| : | |||
| "d" (c), // 5 | |||
| "d" (s) // 6 | |||
| : | |||
/* cr0 is written by addic.; vs32..vs57 are the fixed registers used above. */
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57" | |||
| ); | |||
| } | |||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dscal_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "dscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (j = 0; j < align; j++) { | |||
| x[j] = 0.0; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-j) & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_8_zero(n1, &x[j]); | |||
| j+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_8_zero(n1, x); | |||
| j=n1; | |||
| } | |||
| #endif | |||
| while(j < n) | |||
| { | |||
| @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (j = 0; j < align; j++) { | |||
| x[j] = da * x[j]; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-j) & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_8(n1, &x[j], da); | |||
| j+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_8(n1, x, da); | |||
| j=n1; | |||
| } | |||
| #endif | |||
| while(j < n) | |||
| { | |||
| @@ -0,0 +1,134 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
/*
 * dscal_kernel_8: scales a vector in place, x[i] <- alpha * x[i].
 *
 *   n     - number of doubles. Despite the _8 suffix the data is handled in
 *           chunks of 16 doubles (128 bytes). The first chunk is loaded
 *           before n is tested, so at least 16 doubles must be accessible; a
 *           count that is not a multiple of 16 is effectively rounded up.
 *           NOTE(review): callers are presumably required to pass a positive
 *           multiple of 16 -- confirm at the call sites.
 *   x     - vector to scale; read and written.
 *   alpha - scale factor.
 *
 * The loop is software-pipelined: the loads for the next chunk (offsets
 * 128..224) are issued before the current chunk is stored (offsets 0..96),
 * and x is advanced afterwards.
 */
| #define HAVE_KERNEL_8 1 | |||
| static void dscal_kernel_8 (long n, double *x, double alpha) | |||
| { | |||
| __asm__ | |||
| ( | |||
/* Cache-touch hint for the start of x. */
| "dcbt 0, %2 \n\t" | |||
/*
 * XXSPLTD_S is a macro defined outside this view; presumably it splats alpha
 * into both lanes of vs48 -- confirm against its definition.
 */
| XXSPLTD_S(48,%x3,0) | |||
/* Prologue: load the first 16 doubles into vs32..vs39. */
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
/* If that was the only chunk, skip the loop and go straight to the drain. */
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Scale the current chunk into vs40..vs47 while fetching the next one. */
| "xvmuldp 40, 32, 48 \n\t" | |||
| "xvmuldp 41, 33, 48 \n\t" | |||
| "xvmuldp 42, 34, 48 \n\t" | |||
| "xvmuldp 43, 35, 48 \n\t" | |||
| "lxvp 32, 128(%2) \n\t" | |||
| "lxvp 34, 160(%2) \n\t" | |||
| "xvmuldp 44, 36, 48 \n\t" | |||
| "xvmuldp 45, 37, 48 \n\t" | |||
| "xvmuldp 46, 38, 48 \n\t" | |||
| "xvmuldp 47, 39, 48 \n\t" | |||
| "lxvp 36, 192(%2) \n\t" | |||
| "lxvp 38, 224(%2) \n\t" | |||
/* Store the scaled chunk back over its source, then advance x. */
| "stxvp 40, 0(%2) \n\t" | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
/* Drain: scale and store the last chunk that was loaded; no further loads. */
| "xvmuldp 40, 32, 48 \n\t" | |||
| "xvmuldp 41, 33, 48 \n\t" | |||
| "xvmuldp 42, 34, 48 \n\t" | |||
| "xvmuldp 43, 35, 48 \n\t" | |||
| "xvmuldp 44, 36, 48 \n\t" | |||
| "xvmuldp 45, 37, 48 \n\t" | |||
| "xvmuldp 46, 38, 48 \n\t" | |||
| "xvmuldp 47, 39, 48 \n\t" | |||
| "stxvp 40, 0(%2) \n\t" | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| "#n=%1 alpha=%3 x=%0=%2" | |||
| : | |||
/*
 * NOTE(review): the "+m" operand names only x[0] although every chunk
 * touches 128 bytes -- confirm the compiler cannot reorder other accesses to
 * x around this statement.
 */
| "+m" (*x), | |||
| "+r" (n), // 1 | |||
| "+b" (x) // 2 | |||
| : | |||
| "d" (alpha) // 3 | |||
| : | |||
/* cr0 is written by addic.; vs32..vs48 are the fixed registers used above. */
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" | |||
| ); | |||
| } | |||
/*
 * dscal_kernel_8_zero: overwrites the doubles starting at x with zeros.
 *
 *   n - number of doubles. Each iteration stores 128 bytes (16 doubles) and
 *       decrements n by 16. The loop is bottom-tested, so one full chunk is
 *       always written before n is examined, and a count that is not a
 *       multiple of 16 is effectively rounded up.
 *       NOTE(review): callers are presumably required to pass a positive
 *       multiple of 16 -- confirm at the call sites.
 *   x - destination; written only, never read.
 */
| static void dscal_kernel_8_zero (long n, double *x) | |||
| { | |||
| __asm__ | |||
| ( | |||
/* Zero the register pair vs32/vs33 that every stxvp below writes out. */
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
/* Four 32-byte paired stores of the same zero pair cover 128 bytes. */
| "stxvp 32, 0(%2) \n\t" | |||
| "stxvp 32, 32(%2) \n\t" | |||
| "stxvp 32, 64(%2) \n\t" | |||
| "stxvp 32, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
/* Record-form decrement: updates cr0, which the bgt below tests. */
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "#n=%1 x=%0=%2 " | |||
| : | |||
/*
 * NOTE(review): the "=m" operand names only x[0] although every iteration
 * writes 128 bytes -- confirm the compiler cannot reorder other accesses to
 * x around this statement.
 */
| "=m" (*x), | |||
| "+r" (n), // 1 | |||
| "+b" (x) // 2 | |||
| : | |||
| : | |||
| "cr0","vs32","vs33" | |||
| ); | |||
| } | |||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dswap_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| for (i = 0; i < align; i++) { | |||
| temp = y[i]; | |||
| y[i] = x[i]; | |||
| x[i] = temp; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dswap_kernel_32(n1,&x[i], &y[i]); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dswap_kernel_32(n1, x, y); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sasum_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "sasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (i = 0; i < align; i++) { | |||
| sumf += ABS(x[i]); | |||
| } | |||
| } | |||
| n1 = (n-i) & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sumf += sasum_kernel_32(n1, &x[i]); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sumf = sasum_kernel_32(n1, x); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,153 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| /* Returns the sum of the absolute values of the floats at x (POWER10 VSX inline asm). | |||
| Data is consumed in chunks of 128 bytes (four lxvp loads) and n is decremented by 32 | |||
| per chunk. The first chunk is loaded before n is tested, and the loop loads the next | |||
| chunk while accumulating the current one; the "two" tail handles the last loaded chunk. | |||
| The POWER10 call site in this change passes n1 = (n-i) & -32 and only when n1 > 0. */ static float sasum_kernel_32 (long n, float *x) | |||
| { | |||
| float sum; | |||
| __vector float t0; | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" /* cache-touch hint for the start of x */ | |||
| "xxlxor 32, 32, 32 \n\t" /* vs32-vs39: eight partial-sum accumulators, cleared here */ | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" /* preload the first 128 bytes into vs40-vs47 */ | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" /* x now points at the next chunk */ | |||
| "addic. %1, %1, -32 \n\t" /* n -= 32, result recorded in cr0 */ | |||
| "ble two%= \n\t" /* nothing left after the preloaded chunk: skip the loop */ | |||
| ".align 5 \n" | |||
| "one%=: \n\t" /* steady state: abs of the current chunk, loads of the next chunk interleaved */ | |||
| "xvabssp 48, 40 \n\t" | |||
| "xvabssp 49, 41 \n\t" | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" /* vs40-vs43 are free again: reload with the next chunk */ | |||
| "xvabssp %x3, 44 \n\t" /* t0..t3 (%x3..%x6) receive the abs values of vs44-vs47 */ | |||
| "xvabssp %x4, 45 \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" /* accumulate into vs32-vs39 */ | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "xvaddsp 36, 36, %x3 \n\t" | |||
| "xvaddsp 37, 37, %x4 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "xvaddsp 38, 38, %x5 \n\t" | |||
| "xvaddsp 39, 39, %x6 \n\t" | |||
| "bgt one%= \n" /* another chunk has been loaded and n is still > 0 */ | |||
| "two%=: \n\t" /* tail: process the last loaded chunk, no further loads */ | |||
| "xvabssp 48, 40 \n\t" | |||
| "xvabssp 49, 41 \n\t" | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "xvabssp %x3, 44 \n\t" | |||
| "xvabssp %x4, 45 \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| "xvaddsp 36, 36, %x3 \n\t" | |||
| "xvaddsp 37, 37, %x4 \n\t" | |||
| "xvaddsp 38, 38, %x5 \n\t" | |||
| "xvaddsp 39, 39, %x6 \n\t" | |||
| "xvaddsp 32, 32, 33 \n\t" /* pairwise fold of the eight accumulators down to vs32 */ | |||
| "xvaddsp 34, 34, 35 \n\t" | |||
| "xvaddsp 36, 36, 37 \n\t" | |||
| "xvaddsp 38, 38, 39 \n\t" | |||
| "xvaddsp 32, 32, 34 \n\t" | |||
| "xvaddsp 36, 36, 38 \n\t" | |||
| "xvaddsp 32, 32, 36 \n\t" | |||
| "xxsldwi 33, 32, 32, 2 \n\t" /* horizontal add: shift vs32 by two words and add ... */ | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xxsldwi 33, 32, 32, 1 \n\t" /* ... then by one word and add, so each word holds the total */ | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xscvspdp %x0, 32 \n" /* convert the single-precision total to the scalar form used by the "f" output */ | |||
| "#n=%1 x=%3=%2 sum=%0\n" /* NOTE(review): operand 3 is t0 here; the "m"(*x) input is operand 7 (annotation only, emitted as an assembler comment) */ | |||
| "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" | |||
| : | |||
| "=f" (sum), // 0 | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "=wa" (t0), // 3 | |||
| "=wa" (t1), // 4 | |||
| "=wa" (t2), // 5 | |||
| "=wa" (t3) // 6 | |||
| : | |||
| "m" (*x) /* 7 */ | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| return sum; | |||
| } | |||
| @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -128; | |||
| if ( n1 > 0 ) | |||
| if ( n >= 128 ) | |||
| { | |||
| copy_kernel (n1, x, y); | |||
| i=n1; | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (i = 0; i < align; i++) { | |||
| y[i] = x[i] ; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -128; | |||
| if ( n1 ) | |||
| { | |||
| copy_kernel(n1, &x[i], &y[i]); | |||
| i += n1; | |||
| } | |||
| while(i < n) | |||
| @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "srot_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "srot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (i = 0; i < align; i++) { | |||
| temp = c*x[i] + s*y[i] ; | |||
| y[i] = c*y[i] - s*x[i] ; | |||
| x[i] = temp ; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| srot_kernel_16(n1, &x1[i], &y1[i], c, s); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| temp = c*x[i] + s*y[i] ; | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| /* Applies a plane rotation in place (POWER10 VSX inline asm): | |||
| x[k] = c*x[k] + s*y[k] and y[k] = c*y[k] - s*x[k], using the old x and y values. | |||
| Each pass handles 64 bytes of x and of y (two lxvp each) and subtracts 16 from n. | |||
| The first chunk is loaded before n is tested; the loop loads the next chunk while | |||
| finishing the current one, and the "two" tail handles the last loaded chunk. | |||
| The POWER10 call site in this change passes n1 = (n-i) & -16 and only when n1 > 0. */ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x5 \n\t" // load c to all words | |||
| "xxspltw 36, 36, 0 \n\t" /* vs36 = c splatted across the vector */ | |||
| "xscvdpspn 37, %x6 \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" /* vs37 = s splatted across the vector */ | |||
| "lxvp 32, 0(%3) \n\t" // load x | |||
| "lxvp 34, 32(%3) \n\t" | |||
| "lxvp 48, 0(%4) \n\t" // load y | |||
| "lxvp 50, 32(%4) \n\t" | |||
| "addic. %2, %2, -16 \n\t" /* n -= 16, result recorded in cr0 */ | |||
| "ble two%= \n\t" /* only the preloaded chunk exists: skip the loop */ | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvp 32, 64(%3) \n\t" // load x | |||
| "lxvp 34, 96(%3) \n\t" /* next x chunk; the pointers are advanced only after the stores below */ | |||
| "xvmulsp 52, 48, 36 \n\t" // c * y | |||
| "xvmulsp 53, 49, 36 \n\t" | |||
| "xvmulsp 54, 50, 36 \n\t" | |||
| "xvmulsp 55, 51, 36 \n\t" | |||
| "xvmulsp 38, 48, 37 \n\t" // s * y | |||
| "xvmulsp 39, 49, 37 \n\t" | |||
| "xvmulsp 56, 50, 37 \n\t" | |||
| "xvmulsp 57, 51, 37 \n\t" | |||
| "lxvp 48, 64(%4) \n\t" // load y | |||
| "lxvp 50, 96(%4) \n\t" | |||
| "xvaddsp 40, 40, 38 \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, 39 \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, 56 \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, 57 \n\t" // c * x + s * y | |||
| "stxvp 40, 0(%3) \n\t" // store x | |||
| "stxvp 42, 32(%3) \n\t" | |||
| "xvsubsp 52, 52, 44 \n\t" // c * y - s * x | |||
| "xvsubsp 53, 53, 45 \n\t" // c * y - s * x | |||
| "xvsubsp 54, 54, 46 \n\t" // c * y - s * x | |||
| "xvsubsp 55, 55, 47 \n\t" // c * y - s * x | |||
| "stxvp 52, 0(%4) \n\t" // store y | |||
| "stxvp 54, 32(%4) \n\t" | |||
| "addi %3, %3, 64 \n\t" /* x += 64 bytes */ | |||
| "addi %4, %4, 64 \n\t" /* y += 64 bytes */ | |||
| "addic. %2, %2, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" /* tail: same arithmetic on the last loaded chunk, without further loads */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp 52, 48, 36 \n\t" // c * y | |||
| "xvmulsp 53, 49, 36 \n\t" | |||
| "xvmulsp 54, 50, 36 \n\t" | |||
| "xvmulsp 55, 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp 38, 48, 37 \n\t" // s * y | |||
| "xvmulsp 39, 49, 37 \n\t" | |||
| "xvmulsp 56, 50, 37 \n\t" | |||
| "xvmulsp 57, 51, 37 \n\t" | |||
| "xvaddsp 40, 40, 38 \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, 39 \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, 56 \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, 57 \n\t" // c * x + s * y | |||
| "stxvp 40, 0(%3) \n\t" // store x | |||
| "stxvp 42, 32(%3) \n\t" | |||
| "xvsubsp 52, 52, 44 \n\t" // c * y - s * x | |||
| "xvsubsp 53, 53, 45 \n\t" // c * y - s * x | |||
| "xvsubsp 54, 54, 46 \n\t" // c * y - s * x | |||
| "xvsubsp 55, 55, 47 \n\t" // c * y - s * x | |||
| "stxvp 52, 0(%4) \n\t" // store y | |||
| "stxvp 54, 32(%4) \n\t" | |||
| "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" | |||
| : | |||
| "+m" (*x), /* 0 */ | |||
| "+m" (*y), /* 1 */ | |||
| "+r" (n), // 2 | |||
| "+b" (x), // 3 | |||
| "+b" (y) // 4 | |||
| : | |||
| "f" (c), // 5 | |||
| "f" (s) // 6 | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57" | |||
| ); | |||
| } | |||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sscal_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "sscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (j = 0; j < align; j++) { | |||
| x[j] = 0.0; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-j) & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_16_zero(n1, &x[j]); | |||
| j+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_16_zero(n1, x); | |||
| j=n1; | |||
| } | |||
| #endif | |||
| while(j < n) | |||
| { | |||
| @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (j = 0; j < align; j++) { | |||
| x[j] = da * x[j]; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-j) & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_16(n1, &x[j], da); | |||
| j+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_16(n1, x, da); | |||
| j=n1; | |||
| } | |||
| #endif | |||
| while(j < n) | |||
| { | |||
| @@ -0,0 +1,135 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| /* Scales the floats at x by alpha in place (POWER10 VSX inline asm). | |||
| Despite the _16 in the name, each pass handles 128 bytes (four lxvp/stxvp pairs) | |||
| and subtracts 32 from n. The first chunk is loaded before n is tested; the loop | |||
| loads the next chunk (offsets 128..224) before storing the current one, and the | |||
| "two" tail handles the last loaded chunk. | |||
| The POWER10 call site in this change passes n1 = (n-j) & -32 and only when n1 > 0. */ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" /* cache-touch hint for the start of x */ | |||
| "xscvdpspn 48, %x3 \n\t" /* alpha converted to single precision ... */ | |||
| "xxspltw 48, 48, 0 \n\t" /* ... and splatted across vs48 */ | |||
| "lxvp 32, 0(%2) \n\t" /* preload the first 128 bytes into vs32-vs39 */ | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| "addic. %1, %1, -32 \n\t" /* n -= 32, result recorded in cr0 */ | |||
| "ble two%= \n\t" /* only the preloaded chunk exists: skip the loop */ | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 48 \n\t" /* vs40-vs47 = alpha * current chunk */ | |||
| "xvmulsp 41, 33, 48 \n\t" | |||
| "xvmulsp 42, 34, 48 \n\t" | |||
| "xvmulsp 43, 35, 48 \n\t" | |||
| "lxvp 32, 128(%2) \n\t" /* next chunk; x is advanced only after the stores below */ | |||
| "lxvp 34, 160(%2) \n\t" | |||
| "xvmulsp 44, 36, 48 \n\t" | |||
| "xvmulsp 45, 37, 48 \n\t" | |||
| "xvmulsp 46, 38, 48 \n\t" | |||
| "xvmulsp 47, 39, 48 \n\t" | |||
| "lxvp 36, 192(%2) \n\t" | |||
| "lxvp 38, 224(%2) \n\t" | |||
| "stxvp 40, 0(%2) \n\t" /* write the scaled current chunk back */ | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" /* tail: scale and store the last loaded chunk */ | |||
| "xvmulsp 40, 32, 48 \n\t" | |||
| "xvmulsp 41, 33, 48 \n\t" | |||
| "xvmulsp 42, 34, 48 \n\t" | |||
| "xvmulsp 43, 35, 48 \n\t" | |||
| "xvmulsp 44, 36, 48 \n\t" | |||
| "xvmulsp 45, 37, 48 \n\t" | |||
| "xvmulsp 46, 38, 48 \n\t" | |||
| "xvmulsp 47, 39, 48 \n\t" | |||
| "stxvp 40, 0(%2) \n\t" | |||
| "stxvp 42, 32(%2) \n\t" | |||
| "stxvp 44, 64(%2) \n\t" | |||
| "stxvp 46, 96(%2) \n\t" | |||
| "#n=%1 alpha=%3 x=%0=%2" | |||
| : | |||
| "+m" (*x), /* 0 */ | |||
| "+r" (n), // 1 | |||
| "+b" (x) // 2 | |||
| : | |||
| "f" (alpha) // 3 | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" | |||
| ); | |||
| } | |||
| /* Zero-fill kernel for sscal (POWER10 VSX inline asm): writes zeros over x. | |||
| Each pass stores 128 bytes (four stxvp at offsets 0/32/64/96), advances x by 128 | |||
| and subtracts 32 from n. The count is tested only after the stores, so one pass | |||
| always runs. The POWER10 call site in this change passes n1 = (n-j) & -32 and only | |||
| when n1 > 0. */ static void sscal_kernel_16_zero (long n, float *x) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "xxlxor 32, 32, 32 \n\t" /* vs32 = 0 */ | |||
| "xxlxor 33, 33, 33 \n\t" /* vs33 = 0; stxvp 32 stores the vs32/vs33 pair */ | |||
| ".align 5 \n" | |||
| "one%=: \n\t" /* %= gives a label unique to this asm instance */ | |||
| "stxvp 32, 0(%2) \n\t" | |||
| "stxvp 32, 32(%2) \n\t" | |||
| "stxvp 32, 64(%2) \n\t" | |||
| "stxvp 32, 96(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" /* x += 128 bytes */ | |||
| "addic. %1, %1, -32 \n\t" /* n -= 32; the dot form records the result in cr0 */ | |||
| "bgt one%= \n" /* loop while n > 0 */ | |||
| "#n=%1 x=%0=%2 " | |||
| : | |||
| "=m" (*x), /* 0; NOTE(review): only the first element is declared as written although the loop stores n elements and there is no "memory" clobber -- confirm this is enough for the optimizer */ | |||
| "+r" (n), // 1 | |||
| "+b" (x) // 2 | |||
| : | |||
| : | |||
| "cr0","vs32","vs33" | |||
| ); | |||
| } | |||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sswap_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) | |||
| if ( n >= 64 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| for (i = 0; i < align; i++) { | |||
| temp = y[i]; | |||
| y[i] = x[i]; | |||
| x[i] = temp; | |||
| } | |||
| } | |||
| BLASLONG n1 = (n-i) & -64; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sswap_kernel_32(n1,&x[i], &y[i]); | |||
| i+=n1; | |||
| } | |||
| #else | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sswap_kernel_32(n1, x, y); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,105 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| #if defined(DOUBLE) | |||
| static void dswap_kernel_32 (long n, double *x, double *y) | |||
| #else | |||
| static void sswap_kernel_32 (long n, float *x, float *y) | |||
| #endif | |||
| /* Exchanges the contents of x and y (POWER10 VSX inline asm), shared by the double | |||
| and float builds through the DOUBLE macro. Each pass loads 256 bytes of y into | |||
| vs32-vs47 and 256 bytes of x into vs48-vs63, stores them crosswise, advances both | |||
| pointers by 256 and subtracts 32 (DOUBLE) or 64 (float) from n. The count is tested | |||
| only after the stores, so one pass always runs. The POWER10 call sites in this | |||
| change pass (n-i) & -32 for dswap and (n-i) & -64 for sswap, only when > 0. */ { | |||
| __asm__ | |||
| ( | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "lxvp 32, 0(%4) \n\t" /* y chunk -> vs32-vs47 */ | |||
| "lxvp 34, 32(%4) \n\t" | |||
| "lxvp 36, 64(%4) \n\t" | |||
| "lxvp 38, 96(%4) \n\t" | |||
| "lxvp 40, 128(%4) \n\t" | |||
| "lxvp 42, 160(%4) \n\t" | |||
| "lxvp 44, 192(%4) \n\t" | |||
| "lxvp 46, 224(%4) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" /* x chunk -> vs48-vs63 */ | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "lxvp 56, 128(%3) \n\t" | |||
| "lxvp 58, 160(%3) \n\t" | |||
| "lxvp 60, 192(%3) \n\t" | |||
| "lxvp 62, 224(%3) \n\t" | |||
| "stxvp 32, 0(%3) \n\t" /* old y data written to x */ | |||
| "stxvp 34, 32(%3) \n\t" | |||
| "stxvp 36, 64(%3) \n\t" | |||
| "stxvp 38, 96(%3) \n\t" | |||
| "stxvp 40, 128(%3) \n\t" | |||
| "stxvp 42, 160(%3) \n\t" | |||
| "stxvp 44, 192(%3) \n\t" | |||
| "stxvp 46, 224(%3) \n\t" | |||
| "stxvp 48, 0(%4) \n\t" /* old x data written to y */ | |||
| "stxvp 50, 32(%4) \n\t" | |||
| "stxvp 52, 64(%4) \n\t" | |||
| "stxvp 54, 96(%4) \n\t" | |||
| "stxvp 56, 128(%4) \n\t" | |||
| "stxvp 58, 160(%4) \n\t" | |||
| "stxvp 60, 192(%4) \n\t" | |||
| "stxvp 62, 224(%4) \n\t" | |||
| "addi %4, %4, 256 \n\t" /* y += 256 bytes */ | |||
| "addi %3, %3, 256 \n\t" /* x += 256 bytes */ | |||
| #if defined(DOUBLE) | |||
| "addic. %2, %2, -32 \n\t" | |||
| #else | |||
| "addic. %2, %2, -64 \n\t" | |||
| #endif | |||
| "bgt one%= \n" /* loop while n > 0 */ | |||
| "#n=%2 x=%0=%3 y=%1=%4" | |||
| : | |||
| "+m" (*x), /* 0 */ | |||
| "+m" (*y), /* 1 */ | |||
| "+r" (n), // 2 | |||
| "+b" (x), // 3 | |||
| "+b" (y) // 4 | |||
| : | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||
| ); | |||
| } | |||
| @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| #elif defined(POWER10) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power10.c" | |||
| #else | |||
| #include "cscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| #endif | |||
| @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| { | |||
| #if defined(DOUBLE) | |||
| n1 = n & -8; | |||
| #else | |||
| n1 = n & -16; | |||
| #endif | |||
| if ( n1 > 0 ) | |||
| { | |||
| zscal_kernel_8(n1, x, da_r, da_i); | |||
| @@ -0,0 +1,195 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* Scales the double-precision complex values at x by (alpha_r, alpha_i) in place | |||
| (POWER10 VSX inline asm). Per the inline notes, each vector holds one (real, imag) | |||
| pair: it is multiplied by (alpha_r, alpha_r), its swapped copy is multiplied by | |||
| (-alpha_i, alpha_i), and the two products are added. Each pass handles 128 bytes | |||
| (four lxvp loads, eight stxv stores) and subtracts 8 from n. The first chunk is | |||
| loaded before n is tested; the loop loads the next chunk (offsets 128..224) before | |||
| storing the current one, and the "two" tail handles the last loaded chunk. | |||
| The call site in this change passes n1 = n & -8 for DOUBLE and only when n1 > 0. | |||
| XXSPLTD_S, XXMRGHD_S and XXSWAPD_S are macros defined outside this file. */ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||
| { | |||
| __vector double t0; | |||
| __vector double t1; | |||
| __vector double t2; | |||
| __vector double t3; | |||
| __vector double t4; | |||
| __vector double t5; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" /* cache-touch hint for the start of x */ | |||
| "xsnegdp 33, %x10 \n\t" // -alpha_i | |||
| XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r | |||
| XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i | |||
| "lxvp 40, 0(%2) \n\t" /* preload the first 128 bytes into vs40-vs47 */ | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "addic. %1, %1, -8 \n\t" /* n -= 8, result recorded in cr0 */ | |||
| "ble two%= \n\t" /* only the preloaded chunk exists: skip the loop */ | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "xvmuldp 34, 44, 32 \n\t" | |||
| "xvmuldp 35, 45, 32 \n\t" | |||
| "xvmuldp 36, 46, 32 \n\t" | |||
| "xvmuldp 37, 47, 32 \n\t" | |||
| XXSWAPD_S(38,40) /* swapped copies of the inputs: vs38, vs39 and t0..t5 (%x3..%x8) */ | |||
| XXSWAPD_S(39,41) | |||
| XXSWAPD_S(%x3,42) | |||
| XXSWAPD_S(%x4,43) | |||
| XXSWAPD_S(%x5,44) | |||
| XXSWAPD_S(%x6,45) | |||
| XXSWAPD_S(%x7,46) | |||
| XXSWAPD_S(%x8,47) | |||
| "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmuldp 39, 39, 33 \n\t" | |||
| "xvmuldp %x3, %x3, 33 \n\t" | |||
| "xvmuldp %x4, %x4, 33 \n\t" | |||
| "lxvp 40, 128(%2) \n\t" /* next chunk; x is advanced only after the stores below */ | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "xvmuldp %x5, %x5, 33 \n\t" | |||
| "xvmuldp %x6, %x6, 33 \n\t" | |||
| "xvmuldp %x7, %x7, 33 \n\t" | |||
| "xvmuldp %x8, %x8, 33 \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| "lxvp 46, 224(%2) \n\t" | |||
| "xvadddp 48, 48, 38 \n\t" /* sum of the two products = scaled complex value */ | |||
| "xvadddp 49, 49, 39 \n\t" | |||
| "xvadddp 50, 50, %x3 \n\t" | |||
| "xvadddp 51, 51, %x4 \n\t" | |||
| "stxv 49, 0(%2) \n\t" /* NOTE(review): the odd register of each pair goes to the lower address (vs49 at 0, vs48 at 16); presumably this mirrors how lxvp fills a pair on little-endian -- confirm for big-endian builds */ | |||
| "stxv 48, 16(%2) \n\t" | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| "xvadddp 34, 34, %x5 \n\t" | |||
| "xvadddp 35, 35, %x6 \n\t" | |||
| "xvadddp 36, 36, %x7 \n\t" | |||
| "xvadddp 37, 37, %x8 \n\t" | |||
| "stxv 35, 64(%2) \n\t" | |||
| "stxv 34, 80(%2) \n\t" | |||
| "stxv 37, 96(%2) \n\t" | |||
| "stxv 36, 112(%2) \n\t" | |||
| "addi %2, %2, 128 \n\t" /* x += 128 bytes */ | |||
| "addic. %1, %1, -8 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" /* tail: same arithmetic on the last loaded chunk, without further loads */ | |||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "xvmuldp 34, 44, 32 \n\t" | |||
| "xvmuldp 35, 45, 32 \n\t" | |||
| "xvmuldp 36, 46, 32 \n\t" | |||
| "xvmuldp 37, 47, 32 \n\t" | |||
| XXSWAPD_S(38,40) | |||
| XXSWAPD_S(39,41) | |||
| XXSWAPD_S(%x3,42) | |||
| XXSWAPD_S(%x4,43) | |||
| XXSWAPD_S(%x5,44) | |||
| XXSWAPD_S(%x6,45) | |||
| XXSWAPD_S(%x7,46) | |||
| XXSWAPD_S(%x8,47) | |||
| "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmuldp 39, 39, 33 \n\t" | |||
| "xvmuldp %x3, %x3, 33 \n\t" | |||
| "xvmuldp %x4, %x4, 33 \n\t" | |||
| "xvmuldp %x5, %x5, 33 \n\t" | |||
| "xvmuldp %x6, %x6, 33 \n\t" | |||
| "xvmuldp %x7, %x7, 33 \n\t" | |||
| "xvmuldp %x8, %x8, 33 \n\t" | |||
| "xvadddp 48, 48, 38 \n\t" | |||
| "xvadddp 49, 49, 39 \n\t" | |||
| "xvadddp 50, 50, %x3 \n\t" | |||
| "xvadddp 51, 51, %x4 \n\t" | |||
| "stxv 49, 0(%2) \n\t" | |||
| "stxv 48, 16(%2) \n\t" | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| "xvadddp 34, 34, %x5 \n\t" | |||
| "xvadddp 35, 35, %x6 \n\t" | |||
| "xvadddp 36, 36, %x7 \n\t" | |||
| "xvadddp 37, 37, %x8 \n\t" | |||
| "stxv 35, 64(%2) \n\t" | |||
| "stxv 34, 80(%2) \n\t" | |||
| "stxv 37, 96(%2) \n\t" | |||
| "stxv 36, 112(%2) \n\t" | |||
| "#n=%1 x=%0=%2 alpha=(%9,%10) \n" | |||
| : | |||
| "+m" (*x), /* 0 */ | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "=wa" (t0), // 3 | |||
| "=wa" (t1), // 4 | |||
| "=wa" (t2), // 5 | |||
| "=wa" (t3), // 6 | |||
| "=wa" (t4), // 7 | |||
| "=wa" (t5) // 8 | |||
| : | |||
| "d" (alpha_r), // 9 | |||
| "d" (alpha_i) // 10 | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "zswap_microk_power8.c" | |||
| #elif defined(POWER10) | |||
| #include "cswap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| SOMATCOPY_RT = omatcopy_rt.c | |||
| DOMATCOPY_RT = omatcopy_rt.c | |||
| @@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| @@ -6,7 +6,7 @@ | |||
| #if defined(SKYLAKEX) | |||
| #include "dasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "dasum_microk_haswell-2.c" | |||
| #endif | |||
| @@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT dummy_alpha; | |||
| FLOAT * dummy_b; | |||
| #endif | |||
| FLOAT sumf = 0.0; | |||
| @@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| sumf += (*ptr); | |||
| @@ -2,7 +2,7 @@ | |||
| #if defined(SKYLAKEX) | |||
| #include "drot_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "drot_microk_haswell-2.c" | |||
| #endif | |||
| @@ -0,0 +1,373 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #ifdef HAVE_AVX | |||
| #define ROWS_OF_BLOCK 384 | |||
| /* Inline-asm building blocks for the AVX single-precision scaled transpose-copy; they are only meaningful when expanded inside COMPUTE(ndim). Read-write register operands ("+r"): %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld (both leading dimensions are passed in bytes), %4 = dst_tmp. ROWS_OF_BLOCK caps how many source rows CNAME hands to one COMPUTE call. */ | |||
| /* Memory operands ("m"): %5 = num_rows, %6 = alpha. COMPUTE copies num_rows into r11 and consumes it 4 rows at a time, then an optional 2-row step, then an optional single row. Naming: COPY_RxC reads R source rows by C source columns and writes them transposed (C destination rows of R floats each); TRANS_* transpose 4x4 float tiles in registers (one tile per 128-bit lane for the ymm form); SAVE_* store register tiles through the %4 cursor, stepping it by dst_ld per destination row. */ | |||
| /* ymm15 (low half: xmm15) = alpha replicated into every lane by the vbroadcastss at the top of COMPUTE; each element is multiplied by it once, either as it is loaded or just before it is stored. */ | |||
| #define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ | |||
| "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ | |||
| "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ | |||
| "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ | |||
| "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" | |||
| #define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ | |||
| "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ | |||
| "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ | |||
| "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ | |||
| "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" | |||
| #define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ | |||
| "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||
| "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||
| #define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ | |||
| "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||
| "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||
| #define COPY_4x16 "movq %1,%4; addq $16,%1;"\ | |||
| "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ | |||
| "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ | |||
| TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ | |||
| TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) | |||
| #define COPY_4x8 "movq %1,%4; addq $16,%1;"\ | |||
| "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ | |||
| "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ | |||
| TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) | |||
| #define COPY_4x4 "movq %1,%4; addq $16,%1;"\ | |||
| "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ | |||
| "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ | |||
| TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) | |||
| #define COPY_4x2 \ | |||
| "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||
| "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ | |||
| "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" | |||
| #define COPY_4x1 \ | |||
| "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||
| "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ | |||
| "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;" | |||
| #define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ | |||
| "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ | |||
| "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||
| "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ | |||
| "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||
| #define COPY_2x16 "movq %1,%4; addq $8,%1;"\ | |||
| "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ | |||
| "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ | |||
| SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) | |||
| #define COPY_2x8 "movq %1,%4; addq $8,%1;"\ | |||
| "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ | |||
| "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ | |||
| SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) | |||
| #define COPY_2x4 "movq %1,%4; addq $8,%1;"\ | |||
| "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ | |||
| SAVE_2x4(0,1,4,5) | |||
| #define COPY_2x2 \ | |||
| "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" | |||
| #define COPY_2x1 \ | |||
| "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" | |||
| #define SAVE_1x4(c1_no)\ | |||
| "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ | |||
| "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" | |||
| #define COPY_1x16 "movq %1,%4; addq $4,%1;"\ | |||
| "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ | |||
| "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" | |||
| #define COPY_1x8 "movq %1,%4; addq $4,%1;"\ | |||
| "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" | |||
| #define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" | |||
| #define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" | |||
| #define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" | |||
| #define COMPUTE(ndim){\ | |||
| src = src_base; dst = dst_base;\ | |||
| __asm__ __volatile__(\ | |||
| "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ | |||
| #ndim"31:\n\t"\ | |||
| COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ | |||
| #ndim"32:\n\t"\ | |||
| "cmpq $2,%%r11; jb "#ndim"33f;"\ | |||
| COPY_2x##ndim "subq $2,%%r11;"\ | |||
| #ndim"33:\n\t"\ | |||
| "testq %%r11,%%r11; jz "#ndim"34f;"\ | |||
| COPY_1x##ndim "subq $1,%%r11;"\ | |||
| #ndim"34:\n\t"\ | |||
| :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ | |||
| ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| } | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ | |||
| float *src, *dst, *dst_tmp, *src_base, *dst_base; | |||
| uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; | |||
| BLASLONG cols_left, rows_done; float ALPHA = alpha; | |||
| if(ALPHA==0.0){ | |||
| dst_base = b; | |||
| for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); dst_base += ldb;} | |||
| return 0; | |||
| } | |||
| for(rows_done=0;rows_done<rows;rows_done+=num_rows){ | |||
| num_rows = rows-rows_done; | |||
| if(num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; | |||
| cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; | |||
| if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} | |||
| for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} | |||
| for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} | |||
| for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} | |||
| if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} | |||
| } | |||
| return 0; | |||
| } | |||
| #else | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; | |||
| if (rows <= 0) return 0; | |||
| if (cols <= 0) return 0; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| i = (rows >> 2); | |||
| if (i > 0) { | |||
| do { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| b_offset += 4; | |||
| j = (cols >> 2); | |||
| if (j > 0) { | |||
| do { | |||
| /* Column 1 of MAT_B */ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| /* Column 2 of MAT_B */ | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||
| /* Column 3 of MAT_B */ | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A | |||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||
| *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; | |||
| *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; | |||
| /* Column 4 of MAT_B */ | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A | |||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||
| *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; | |||
| *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } // if(j > 0) | |||
| if (cols & 2) { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||
| *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||
| *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| a_offset3 += 2; | |||
| a_offset4 += 2; | |||
| b_offset1 += ldb*2; | |||
| } | |||
| if (cols & 1) { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; | |||
| *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; | |||
| } | |||
| i--; | |||
| } while (i > 0); | |||
| } | |||
| if (rows & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| b_offset += 2; | |||
| j = (cols >> 2); | |||
| if (j > 0){ | |||
| do { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; | |||
| *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } | |||
| if (cols & 2){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| b_offset1 += ldb*2; | |||
| } | |||
| if (cols & 1){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; | |||
| } | |||
| } // if (rows & 2) | |||
| if (rows & 1) { | |||
| a_offset1 = a_offset; | |||
| a_offset += lda; | |||
| b_offset1 = b_offset; | |||
| b_offset2 = b_offset1 + ldb; | |||
| b_offset3 = b_offset2 + ldb; | |||
| b_offset4 = b_offset3 + ldb; | |||
| j = (cols >> 2); | |||
| if (j > 0){ | |||
| do { | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; | |||
| *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; | |||
| a_offset1 += 4; | |||
| b_offset1 += ldb * 4; | |||
| b_offset2 += ldb * 4; | |||
| b_offset3 += ldb * 4; | |||
| b_offset4 += ldb * 4; | |||
| j--; | |||
| } while (j > 0); | |||
| } | |||
| if (cols & 2){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; | |||
| a_offset1 += 2; | |||
| b_offset1 += ldb * 2; | |||
| } | |||
| if (cols & 1){ | |||
| *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| #endif | |||
| @@ -11,7 +11,7 @@ | |||
| #if defined(SKYLAKEX) | |||
| #include "sasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "sasum_microk_haswell-2.c" | |||
| #endif | |||
| @@ -0,0 +1,426 @@ | |||
| #include "sbgemm.h" | |||
| #include <immintrin.h> | |||
| // Work around intrinsics that some compilers are missing: the plain 256-bit epi16 load/store are expressed through the masked forms with an all-ones mask (~0). | |||
| #define MM256_LOADU_EPI16(addr) \ | |||
| _mm256_maskz_loadu_epi16(~0, (addr)) | |||
| #define MM256_STOREU_EPI16(addr, reg) \ | |||
| _mm256_mask_storeu_epi16((addr), ~0, (reg)) | |||
| #include <stdio.h> | |||
| void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) | |||
| { | |||
| printf("---- BLOCK %ld x %ld ----\n", m, n); | |||
| for (BLASLONG i=0; i<m; i++) { | |||
| for (BLASLONG j=0; j<n; j++) { | |||
| printf("%-4X ", *(mat + i*n +j)); | |||
| } | |||
| printf("\n"); | |||
| } | |||
| printf("---- End of BLOCK ----\n"); | |||
| } | |||
| void COL_MAJOR_INCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||
| { | |||
| BLASLONG tag_k_2x = k & (~1); | |||
| __m512i array512_0, array512_1, array512_2, array512_3; | |||
| BLASLONG idx_src_base0, idx_src_base1; | |||
| BLASLONG idx_target_base0, idx_target_base1; | |||
| BLASLONG LDA_2x = 2*lda; | |||
| BLASLONG BF16_BLOCK_T_M_2x = 2*32; | |||
| idx_src_base0 = 0; | |||
| idx_src_base1 = lda; | |||
| idx_target_base0 = 0; | |||
| idx_target_base1 = 32; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { | |||
| array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); | |||
| array512_1 = _mm512_loadu_si512(&A[idx_src_base1]); | |||
| array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); | |||
| array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); | |||
| _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); | |||
| _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); | |||
| idx_src_base0 += LDA_2x; | |||
| idx_src_base1 += LDA_2x; | |||
| idx_target_base0 += BF16_BLOCK_T_M_2x; | |||
| idx_target_base1 += BF16_BLOCK_T_M_2x; | |||
| } | |||
| if (tag_k_2x != k) { | |||
| __m512i ZERO512 = _mm512_setzero_si512(); | |||
| array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); | |||
| array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); | |||
| array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); | |||
| _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); | |||
| _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||
| #endif | |||
| } | |||
| void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||
| { | |||
| BLASLONG tag_k_2x = k & (~1); | |||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-m)); | |||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||
| __m512i array512_0, array512_1, array512_2, array512_3; | |||
| BLASLONG idx_src_base0, idx_src_base1; | |||
| BLASLONG idx_target_base0, idx_target_base1; | |||
| BLASLONG LDA_2x = 2*lda; | |||
| BLASLONG BF16_BLOCK_T_M_2x = 2*32; | |||
| idx_src_base0 = 0; | |||
| idx_src_base1 = lda; | |||
| idx_target_base0 = 0; | |||
| idx_target_base1 = 32; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { | |||
| array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); | |||
| array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); | |||
| array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); | |||
| array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); | |||
| _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); | |||
| _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); | |||
| idx_src_base0 += LDA_2x; | |||
| idx_src_base1 += LDA_2x; | |||
| idx_target_base0 += BF16_BLOCK_T_M_2x; | |||
| idx_target_base1 += BF16_BLOCK_T_M_2x; | |||
| } | |||
| if (tag_k_2x != k) { | |||
| __m512i ZERO512 = _mm512_setzero_si512(); | |||
| array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); | |||
| array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); | |||
| array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); | |||
| _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); | |||
| _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||
| #endif | |||
| } | |||
| void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||
| { | |||
| BLASLONG tag_k_2x = k & (~1); | |||
| __m256i array256_0, array256_1, array256_2, array256_3; | |||
| BLASLONG idx_src_base0, idx_src_base1; | |||
| BLASLONG idx_target_base0; | |||
| BLASLONG LDA_2x = 2*lda; | |||
| idx_src_base0 = 0; | |||
| idx_src_base1 = lda; | |||
| idx_target_base0 = 0; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { | |||
| array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); | |||
| array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); | |||
| array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); | |||
| array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); | |||
| // Store in one row of block_B | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); | |||
| idx_src_base0 += LDA_2x; | |||
| idx_src_base1 += LDA_2x; | |||
| idx_target_base0 += 32; | |||
| } | |||
| if (tag_k_2x != k) { | |||
| __m256i ZERO256 = _mm256_setzero_si256(); | |||
| array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); | |||
| array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); | |||
| array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); | |||
| // Store in one row of block_B | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||
| #endif | |||
| } | |||
| void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||
| { | |||
| BLASLONG tag_k_2x = k & (~1); | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| __m256i array256_0, array256_1, array256_2, array256_3; | |||
| BLASLONG idx_src_base0, idx_src_base1; | |||
| BLASLONG idx_target_base0; | |||
| BLASLONG LDA_2x = 2*lda; | |||
| idx_src_base0 = 0; | |||
| idx_src_base1 = lda; | |||
| idx_target_base0 = 0; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { | |||
| array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); | |||
| array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); | |||
| array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); | |||
| array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); | |||
| // Store in one row of block_B | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); | |||
| idx_src_base0 += LDA_2x; | |||
| idx_src_base1 += LDA_2x; | |||
| idx_target_base0 += 32; | |||
| } | |||
| if (tag_k_2x != k) { | |||
| __m256i ZERO256 = _mm256_setzero_si256(); | |||
| array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); | |||
| array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); | |||
| array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); | |||
| // Store in one row of block_B | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); | |||
| MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); | |||
| #endif | |||
| } | |||
| void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) | |||
| { | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; | |||
| BLASLONG idx_target_base0; | |||
| idx_src_base0 = 0; | |||
| idx_src_base1 = 1*ldb; | |||
| idx_src_base2 = 2*ldb; | |||
| idx_src_base3 = 3*ldb; | |||
| idx_src_base4 = 4*ldb; | |||
| idx_src_base5 = 5*ldb; | |||
| idx_src_base6 = 6*ldb; | |||
| idx_src_base7 = 7*ldb; | |||
| idx_target_base0 = 0; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); | |||
| idx_target_base0 += 32*8; | |||
| } | |||
| if (tag_k_32x != k) { | |||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); | |||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); | |||
| #endif | |||
| } | |||
| void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) | |||
| { | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG tag_n_2x = n & (~1); | |||
| BLASLONG idx_src_base0; | |||
| BLASLONG idx_target_base0; | |||
| BLASLONG LDB_2x = 2*ldb; | |||
| idx_target_base0 = 0; | |||
| for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { | |||
| idx_src_base0 = 0; | |||
| for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); | |||
| idx_src_base0 += LDB_2x; | |||
| idx_target_base0 += 64; | |||
| } | |||
| if (tag_n_2x != n) { | |||
| _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); | |||
| idx_target_base0 += 32; | |||
| } | |||
| } | |||
| if (tag_k_32x != k) { | |||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); | |||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||
| idx_src_base0 = 0; | |||
| for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); | |||
| _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); | |||
| idx_src_base0 += LDB_2x; | |||
| idx_target_base0 += 64; | |||
| } | |||
| if (tag_n_2x != n) { | |||
| _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); | |||
| } | |||
| } | |||
| #ifdef DEBUG_PROFILE | |||
| print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); | |||
| #endif | |||
| } | |||
| // Scale matrix C in place by beta (for beta neither ZERO nor ONE): column-major C is walked 4 columns at a time in 16-float vectors, with a masked load/store for the last M%16 floats of each column and a one-column loop for the last N%4 columns. Any Order other than CblasColMajor is currently a no-op (empty else branch). | |||
| void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) | |||
| { | |||
| BLASLONG tag_n_Nx = N & (~3); | |||
| BLASLONG tag_n_Mx = M & (~15); | |||
| BLASLONG LDC4x = ldc*4; | |||
| BLASLONG idx_base_0 = 0; | |||
| BLASLONG idx_base_1 = ldc; | |||
| BLASLONG idx_base_2 = ldc*2; | |||
| BLASLONG idx_base_3 = ldc*3; | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| __m512 array_512_0, array_512_1, array_512_2, array_512_3; | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| if (Order == CblasColMajor) { | |||
| for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { | |||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||
| array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); | |||
| array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); | |||
| array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); | |||
| array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); | |||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||
| array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); | |||
| array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); | |||
| array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); | |||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); | |||
| _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); | |||
| _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); | |||
| _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); | |||
| } | |||
| if (tag_n_Mx != M) { | |||
| array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); | |||
| array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); | |||
| array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); | |||
| array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); | |||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||
| array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); | |||
| array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); | |||
| array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); | |||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); | |||
| _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); | |||
| _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); | |||
| _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); | |||
| } | |||
| idx_base_0 += LDC4x; | |||
| idx_base_1 += LDC4x; | |||
| idx_base_2 += LDC4x; | |||
| idx_base_3 += LDC4x; | |||
| } | |||
| if (tag_n_Nx != N) { | |||
| for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { | |||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||
| array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); | |||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); | |||
| } | |||
| if (tag_n_Mx != M) { | |||
| array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); | |||
| array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); | |||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); | |||
| } | |||
| idx_base_0 += ldc; | |||
| } | |||
| } | |||
| } else { | |||
| } | |||
| } | |||
| // Set matrix C to zero (presumably the beta == ZERO case - confirm against the caller): column-major C is walked 4 columns at a time with 16-float zero stores, a masked store covers the last M%16 floats of each column, and a one-column loop handles the last N%4 columns. Any Order other than CblasColMajor is currently a no-op (empty else branch). | |||
| void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) | |||
| { | |||
| BLASLONG tag_n_Nx = N & (~3); | |||
| BLASLONG tag_n_Mx = M & (~15); | |||
| BLASLONG LDC4x = ldc*4; | |||
| BLASLONG idx_base_0 = 0; | |||
| BLASLONG idx_base_1 = ldc; | |||
| BLASLONG idx_base_2 = ldc*2; | |||
| BLASLONG idx_base_3 = ldc*3; | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| __m512 ZEROVECTOR = _mm512_setzero_ps(); | |||
| if (Order == CblasColMajor) { | |||
| for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { | |||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); | |||
| _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); | |||
| _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); | |||
| _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); | |||
| } | |||
| if (tag_n_Mx != M) { | |||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); | |||
| _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); | |||
| _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); | |||
| _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); | |||
| } | |||
| idx_base_0 += LDC4x; | |||
| idx_base_1 += LDC4x; | |||
| idx_base_2 += LDC4x; | |||
| idx_base_3 += LDC4x; | |||
| } | |||
| if (tag_n_Nx != N) { | |||
| for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { | |||
| for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { | |||
| _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); | |||
| } | |||
| if (tag_n_Mx != M) { | |||
| _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); | |||
| } | |||
| idx_base_0 += ldc; | |||
| } | |||
| } | |||
| } else { | |||
| } | |||
| } | |||
| @@ -0,0 +1,625 @@ | |||
| #include "sbgemm.h" | |||
| #include "bf16_common_macros.h" | |||
| #include <immintrin.h> | |||
| #undef STORE16_COMPLETE_RESULT | |||
| #undef STORE16_MASK_COMPLETE_RESULT | |||
| #undef SBGEMM_BLOCK_KERNEL_32x8x32 | |||
| #undef SBGEMM_BLOCK_KERNEL_16x8x32 | |||
| #undef SBGEMM_BLOCK_KERNEL_32xNx32 | |||
| #undef SBGEMM_BLOCK_KERNEL_16xNx32 | |||
| #undef SBGEMM_BLOCKING_KERNEL_2 | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE: bind the generic names used below to the *_alpha kernels and the *_ALPHA_ONE store macros (presumably defined in bf16_common_macros.h - confirm) | |||
| #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE | |||
| #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE | |||
| #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha | |||
| #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha | |||
| #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha | |||
| #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha | |||
| #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha | |||
| #else // ALPHA is ONE: bind the generic names to the *_one kernels and the *_ONE_ONE store macros | |||
| #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE | |||
| #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE | |||
| #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one | |||
| #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one | |||
| #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one | |||
| #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one | |||
| #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_one | |||
| #endif | |||
| // SBGEMM Kernel for 16<M<=32, N=8, K can be any number, but the processing will take 32 as a base. Products are accumulated in 16 zeroed registers (0-7 from the first A vector, 8-15 from the second), merged pairwise with _mm512_permutex2var_ps and passed to the STORE16_* macros for C[ldc*j] and C[ldc*j+16]; when m != 32 the upper 16 floats of each column use a mask of m-16 bits. NOTE(review): A is indexed by idx only (idx<<5), not by idx_k, so each 32-wide K chunk re-reads A from its start - confirm whether k > 32 is expected here. | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_block_kernel_32x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_block_kernel_32x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #endif | |||
| { | |||
| int SHUFFLE_MAGIC_NO = 0x39; | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG idxA_base = 0; | |||
| BLASLONG idxB_base = 0; | |||
| BLASLONG width = 32; | |||
| #ifndef ONE_ALPHA | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| __m512i arrayA_512_0, arrayA_512_1; | |||
| __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; | |||
| __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7, | |||
| result_512_8, result_512_9, result_512_10, result_512_11, result_512_12, result_512_13, result_512_14, result_512_15; | |||
| __m512 result_512_tmp_0, result_512_tmp_1, result_512_tmp_2, result_512_tmp_3; | |||
| __m512i M512_EPI32_8 = _mm512_set1_epi32(8); | |||
| __m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0); | |||
| __m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8); | |||
| result_512_0 = _mm512_setzero_ps(); | |||
| result_512_1 = _mm512_setzero_ps(); | |||
| result_512_2 = _mm512_setzero_ps(); | |||
| result_512_3 = _mm512_setzero_ps(); | |||
| result_512_4 = _mm512_setzero_ps(); | |||
| result_512_5 = _mm512_setzero_ps(); | |||
| result_512_6 = _mm512_setzero_ps(); | |||
| result_512_7 = _mm512_setzero_ps(); | |||
| result_512_8 = _mm512_setzero_ps(); | |||
| result_512_9 = _mm512_setzero_ps(); | |||
| result_512_10 = _mm512_setzero_ps(); | |||
| result_512_11 = _mm512_setzero_ps(); | |||
| result_512_12 = _mm512_setzero_ps(); | |||
| result_512_13 = _mm512_setzero_ps(); | |||
| result_512_14 = _mm512_setzero_ps(); | |||
| result_512_15 = _mm512_setzero_ps(); | |||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||
| // Load B with unroll 8: eight 512-bit vectors (32 bf16 each) starting at element idx_k*8, one per output column | |||
| idxB_base = idx_k << 3; | |||
| arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); | |||
| arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); | |||
| arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); | |||
| arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); | |||
| arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); | |||
| arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); | |||
| arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]); | |||
| arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); | |||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||
| for (BLASLONG idx = 0; idx < width;) { | |||
| // Each two rows are a group for 32-pair bf16 elements: one step (idx += 2) reads 64 bf16 of A at element idx*32 as two 512-bit vectors | |||
| idxA_base = idx << 5; | |||
| arrayA_512_0 = _mm512_loadu_si512(&A[idxA_base]); | |||
| arrayA_512_1 = _mm512_loadu_si512(&A[idxA_base + 32]); | |||
| result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||
| result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||
| result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||
| result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||
| result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||
| result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||
| result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||
| result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||
| result_512_8 = _mm512_dpbf16_ps(result_512_8, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||
| result_512_9 = _mm512_dpbf16_ps(result_512_9, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||
| result_512_10 = _mm512_dpbf16_ps(result_512_10, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||
| result_512_11 = _mm512_dpbf16_ps(result_512_11, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||
| result_512_12 = _mm512_dpbf16_ps(result_512_12, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||
| result_512_13 = _mm512_dpbf16_ps(result_512_13, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||
| result_512_14 = _mm512_dpbf16_ps(result_512_14, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||
| result_512_15 = _mm512_dpbf16_ps(result_512_15, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||
| arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); | |||
| idx += 2; | |||
| // Every 4 loops (idx a multiple of 8) we need to switch to next 128 bits of arrayB registers; the per-loop shuffle above only rotates the 32-bit pairs inside each 128-bit lane | |||
| if ((idx & (~7)) == idx) { | |||
| arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); | |||
| } | |||
| } | |||
| } | |||
| if (m != 32) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask) | |||
| } else { | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16])) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16])) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16])) | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); | |||
| result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); | |||
| result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16])) | |||
| } | |||
| } | |||
| // SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base. One 512-bit A vector per step feeds 8 zeroed accumulators (one per output column); before storing, _mm512_shuffle_f32x4(..., 0xd8) swaps the two middle 128-bit lanes of each result, and when m != 16 the store is masked to the low m floats of each column. NOTE(review): A is indexed by idx only (idx<<4), not by idx_k, so each 32-wide K chunk re-reads A from its start - confirm whether k > 32 is expected here. | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #endif | |||
| { | |||
| int SHUFFLE_MAGIC_NO = 0x39; | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG idxB_base = 0; | |||
| BLASLONG width = 32; | |||
| #ifndef ONE_ALPHA | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| __m512i arrayA_512_0; | |||
| __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; | |||
| __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; | |||
| result_512_0 = _mm512_setzero_ps(); | |||
| result_512_1 = _mm512_setzero_ps(); | |||
| result_512_2 = _mm512_setzero_ps(); | |||
| result_512_3 = _mm512_setzero_ps(); | |||
| result_512_4 = _mm512_setzero_ps(); | |||
| result_512_5 = _mm512_setzero_ps(); | |||
| result_512_6 = _mm512_setzero_ps(); | |||
| result_512_7 = _mm512_setzero_ps(); | |||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||
| // Load B with unroll 8: eight 512-bit vectors (32 bf16 each) starting at element idx_k*8, one per output column | |||
| idxB_base = idx_k << 3; | |||
| arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); | |||
| arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); | |||
| arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); | |||
| arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); | |||
| arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); | |||
| arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); | |||
| arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]); | |||
| arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); | |||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||
| for (BLASLONG idx = 0; idx < width;) { | |||
| // Each two rows are a group for 32-pair bf16 elements; one step advances idx by 2 | |||
| // Load two rows into a 512 register (32 bf16 of A at element idx*16) | |||
| arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]); | |||
| result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); | |||
| result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); | |||
| result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); | |||
| result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); | |||
| result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); | |||
| result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); | |||
| result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); | |||
| result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); | |||
| arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); | |||
| idx += 2; | |||
| // Every 4 loops (idx a multiple of 8) we need to switch to next 128 bits of arrayB registers; the per-loop shuffle above only rotates the 32-bit pairs inside each 128-bit lane | |||
| if ((idx & (~7)) == idx) { | |||
| arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); | |||
| arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); | |||
| } | |||
| } | |||
| } | |||
| if (m != 16) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); | |||
| result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); | |||
| result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); | |||
| result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) | |||
| result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); | |||
| result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); | |||
| result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); | |||
| result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) | |||
| } else { | |||
| result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); | |||
| result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); | |||
| result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); | |||
| result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); | |||
| STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) | |||
| STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) | |||
| STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) | |||
| STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) | |||
| result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); | |||
| result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); | |||
| result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); | |||
| result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); | |||
| STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) | |||
| STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) | |||
| STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) | |||
| STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) | |||
| } | |||
| } | |||
| // SBGEMM Kernel for 16<M<=32, N<8, K can be any number, but the processing will take 32 as a base. Looped over n columns: result_512[i] accumulates the first A vector and result_512[i+8] the second for column i; the pair is merged with _mm512_permutex2var_ps and passed to the STORE16_* macros for C[ldc*i] and C[ldc*i+16] (masked to m-16 floats when m != 32). B is consumed sequentially (idxB_base advances 32 bf16 per column and is never reset), so each K chunk occupies n*32 elements. NOTE(review): A is indexed by idx only (idx<<5), not by idx_k - confirm whether k > 32 is expected here. | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_block_kernel_32xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_block_kernel_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #endif | |||
| { | |||
| int SHUFFLE_MAGIC_NO = 0x39; | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG idxA_base = 0; | |||
| BLASLONG idxB_base = 0; | |||
| BLASLONG width = 32; | |||
| #ifndef ONE_ALPHA | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| __m512i arrayA_512[2]; | |||
| __m512i arrayB_512[8]; | |||
| __m512 result_512[16]; | |||
| __m512 result_512_tmp_0, result_512_tmp_1; | |||
| __m512i M512_EPI32_8 = _mm512_set1_epi32(8); | |||
| __m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0); | |||
| __m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8); | |||
| for (int i = 0; i < 15; i += 2) { | |||
| result_512[i] = _mm512_setzero_ps(); | |||
| result_512[i+1] = _mm512_setzero_ps(); | |||
| } | |||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||
| // Load B with unroll n: one 512-bit vector (32 bf16) per column, read back to back from the running offset idxB_base | |||
| for (int i = 0; i < n; i ++) { | |||
| arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); | |||
| idxB_base += 32; | |||
| } | |||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||
| for (BLASLONG idx = 0; idx < width;) { | |||
| // Each two rows are a group for 32-pair bf16 elements: one step (idx += 2) reads 64 bf16 of A at element idx*32 as two 512-bit vectors | |||
| idxA_base = idx << 5; | |||
| arrayA_512[0] = _mm512_loadu_si512(&A[idxA_base]); | |||
| arrayA_512[1] = _mm512_loadu_si512(&A[idxA_base + 32]); | |||
| for (int i = 0; i < n; i++) { | |||
| result_512[i] = _mm512_dpbf16_ps(result_512[i] , (__m512bh) arrayA_512[0], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||
| result_512[i+8] = _mm512_dpbf16_ps(result_512[i+8], (__m512bh) arrayA_512[1], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||
| arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); | |||
| } | |||
| idx += 2; | |||
| // Every 4 loops (idx a multiple of 8) we need to switch to next 128 bits of arrayB registers; the per-loop shuffle above only rotates the 32-bit pairs inside each 128-bit lane | |||
| if ((idx & (~7)) == idx) { | |||
| for (int i = 0; i < n; i++) { | |||
| arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (m != 32) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| for (int i = 0; i < n; i++) { | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) | |||
| STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) | |||
| } | |||
| } else { | |||
| for (int i = 0; i < n; i++) { | |||
| result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); | |||
| result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) | |||
| STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) | |||
| } | |||
| } | |||
| } | |||
| // SBGEMM Kernel for M<=16, N<8, K can be any number, but the processing will take 32 as a base. One 512-bit A vector per step feeds result_512[i] for each of the n columns; before storing, _mm512_shuffle_f32x4(..., 0xd8) swaps the two middle 128-bit lanes of each result, and when m != 16 the store is masked to the low m floats of each column. B is consumed sequentially (idxB_base advances 32 bf16 per column and is never reset). NOTE(review): A is indexed by idx only (idx<<4), not by idx_k - confirm whether k > 32 is expected here. | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) | |||
| #endif | |||
| { | |||
| int SHUFFLE_MAGIC_NO = 0x39; | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| BLASLONG idxB_base = 0; | |||
| BLASLONG width = 32; | |||
| #ifndef ONE_ALPHA | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| __m512i arrayA_512; | |||
| __m512i arrayB_512[8]; | |||
| __m512 result_512[8]; | |||
| for (int i = 0; i < 8; i += 2) { | |||
| result_512[i] = _mm512_setzero_ps(); | |||
| result_512[i+1] = _mm512_setzero_ps(); | |||
| } | |||
| for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { | |||
| // Load B with unroll n: one 512-bit vector (32 bf16) per column, read back to back from the running offset idxB_base | |||
| for (int i = 0; i < n; i ++) { | |||
| arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); | |||
| idxB_base += 32; | |||
| } | |||
| if (idx_k == tag_k_32x) {width = k - tag_k_32x;} | |||
| for (BLASLONG idx = 0; idx < width;) { | |||
| // Each two rows are a group for 32-pair bf16 elements; one step advances idx by 2 | |||
| // Load two rows into a 512 register (32 bf16 of A at element idx*16) | |||
| arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); | |||
| for (int i = 0; i < n; i ++) { | |||
| result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); | |||
| arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); | |||
| } | |||
| idx += 2; | |||
| // Every 4 loops (idx a multiple of 8) we need to switch to next 128 bits of arrayB registers; the per-loop shuffle above only rotates the 32-bit pairs inside each 128-bit lane | |||
| if ((idx & (~7)) == idx) { | |||
| for (int i = 0; i < n; i++) { | |||
| arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (m != 16) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| for (int i = 0; i < n; i++) { | |||
| result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); | |||
| STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) | |||
| } | |||
| } else { | |||
| for (int i = 0; i < n; i++) { | |||
| result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); | |||
| STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) | |||
| } | |||
| } | |||
| } | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) | |||
| #endif | |||
| { /* Blocked driver: walks N in panels of at most BF16_BLOCK_THRES_N columns and K in chunks of at most BF16_BLOCK_THRES_K, hands A and B to the COL_MAJOR_*COPY kernels with block_A / block_B as destinations (presumably packing them), then runs the SBGEMM_BLOCK_KERNEL_* macros on the matching part of C. A(), B(), C() and all kernels are macros defined outside this block. */ | |||
| BLASLONG m_step, n_step, k_step, k_step_round32; | |||
| BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); /* M with the bits below BF16_BLOCK_THRES_M cleared; a multiple of the threshold assuming it is a power of two - TODO confirm */ | |||
| BLASLONG n_from, n_to; | |||
| BLASLONG tag_n_Nx; | |||
| n_from = 0; | |||
| n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; /* first N panel ends at min(N, BF16_BLOCK_THRES_N) */ | |||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); /* n_to with the bits below BF16_BLOCK_STEP_N cleared: end of the full column groups in this panel */ | |||
| k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; /* size of the first K chunk: min(K, BF16_BLOCK_THRES_K) */ | |||
| k_step_round32 = k_step & (~31); | |||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; /* k_step rounded up to a multiple of 32; used below as the per-column stride into block_B */ | |||
| if (M >= BF16_BLOCK_THRES_M) { /* at least one full M tile */ | |||
| while (n_from < N) { /* one iteration per N panel [n_from, n_to) */ | |||
| for (BLASLONG idx_k = 0; idx_k < K;) { /* one iteration per K chunk; idx_k is advanced at the bottom of the body */ | |||
| // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... | |||
| COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); | |||
| // TODO: MT | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { /* first M tile: copy each BF16_BLOCK_STEP_N-column group of B into block_B, then run the kernel on it */ | |||
| // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... | |||
| COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); | |||
| SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { /* leftover columns of the panel that do not fill a BF16_BLOCK_STEP_N group */ | |||
| n_step = n_to - tag_n_Nx; | |||
| COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); | |||
| SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); | |||
| } | |||
| for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { /* remaining full M tiles: only A is copied again; block_B filled above is reused as-is */ | |||
| COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||
| SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { | |||
| n_step = n_to - tag_n_Nx; | |||
| SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); | |||
| } | |||
| } | |||
| if (tag_m_Nx != M) { /* leftover rows past the last full M tile; the copy/kernel variant is picked by how many rows remain */ | |||
| m_step = M - tag_m_Nx; | |||
| if (m_step > 16) { /* more than 16 leftover rows: 32-wide variants, called with m_step instead of 32 */ | |||
| COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||
| SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { | |||
| n_step = n_to - tag_n_Nx; | |||
| SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||
| } | |||
| } else if (m_step == 16) { /* exactly 16 leftover rows: 16-wide variants */ | |||
| COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||
| SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { | |||
| n_step = n_to - tag_n_Nx; | |||
| SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||
| } | |||
| } else { /* fewer than 16 leftover rows: Kx16m copy with the 16-wide kernels */ | |||
| COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||
| SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { | |||
| n_step = n_to - tag_n_Nx; | |||
| SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); | |||
| } | |||
| } | |||
| } | |||
| idx_k += k_step; /* advance to the next K chunk, then recompute its size and rounded-up stride */ | |||
| k_step = K - idx_k; | |||
| k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; | |||
| k_step_round32 = k_step & (~31); | |||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; | |||
| } | |||
| n_from = n_to; /* advance to the next N panel, clamped to N */ | |||
| n_to += BF16_BLOCK_THRES_N; | |||
| n_to = (n_to > N) ? N : n_to; | |||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); | |||
| } | |||
| } else { /* M < BF16_BLOCK_THRES_M: a single partial M tile */ | |||
| m_step = M - tag_m_Nx; /* NOTE(review): equals M only when tag_m_Nx is 0 here, i.e. when BF16_BLOCK_THRES_M is a power of two - confirm */ | |||
| while (n_from < N) { | |||
| for (BLASLONG idx_k = 0; idx_k < K;) { | |||
| // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... | |||
| COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); /* NOTE(review): unlike the leftover-row handling in the other branch, this path uses the 32-wide copy and kernels even when m_step <= 16 - confirm they handle that */ | |||
| // TODO: MT | |||
| for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { | |||
| // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... | |||
| COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); | |||
| SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); | |||
| } | |||
| if (tag_n_Nx != n_to) { | |||
| n_step = n_to - tag_n_Nx; | |||
| COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); | |||
| SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); | |||
| } | |||
| idx_k += k_step; | |||
| k_step = K - idx_k; | |||
| k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; | |||
| k_step_round32 = k_step & (~31); | |||
| k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; | |||
| } | |||
| n_from = n_to; | |||
| n_to += BF16_BLOCK_THRES_N; | |||
| n_to = (n_to > N) ? N : n_to; | |||
| tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); | |||
| } | |||
| } | |||
| } | |||
| #ifndef ONE_ALPHA // ALPHA is not ONE | |||
| void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) | |||
| #else // ALPHA is ONE | |||
| void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) | |||
| #endif | |||
| { /* Entry point: provides the two scratch buffers and dispatches on Order. TransA and TransB are accepted but not read in this body. */ | |||
| bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; /* scratch passed as block_A; NOTE(review): automatic array - confirm the product of the two thresholds fits the thread stack */ | |||
| bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; /* scratch passed as block_B; same stack-size caveat */ | |||
| // TODO: assume no trans for both A and B, to complement these scenarios later | |||
| if (Order == CblasColMajor) { | |||
| SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); | |||
| } else { /* any other Order: nothing is computed and C is left untouched */ | |||
| } | |||
| } | |||
| @@ -1,8 +1,11 @@ | |||
| /* the direct sgemm code written by Arjan van der Ven */ | |||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #if defined(SKYLAKEX) || defined (COOPERLAKE) | |||
| /* | |||
| * "Direct sgemm" code. This code operates directly on the inputs and outputs | |||
| * of the sgemm call, avoiding the copies, memory realignments and threading, | |||
| @@ -2,7 +2,7 @@ | |||
| #if defined(SKYLAKEX) | |||
| #include "srot_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #include "srot_microk_haswell-2.c" | |||
| #endif | |||
| @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| #if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) | |||
| #if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) | |||
| const int vstep = v_nlanes_f32; | |||
| const int unrollx4 = n & (-vstep * 4); | |||
| const int unrollx = n & -vstep; | |||
| @@ -1,5 +1,4 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #if defined(HAVE_FMA3) && defined(HAVE_AVX2) | |||
| #define HAVE_SROT_KERNEL 1 | |||
| @@ -320,12 +320,13 @@ | |||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | |||
| COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | |||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | |||
| $ U12, X | |||
| $ U12, X, ABI12, Y | |||
| * .. | |||
| * .. External Functions .. | |||
| COMPLEX CLADIV | |||
| LOGICAL LSAME | |||
| REAL CLANHS, SLAMCH | |||
| EXTERNAL LSAME, CLANHS, SLAMCH | |||
| EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | |||
| @@ -729,22 +730,34 @@ | |||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | |||
| $ ( BSCALE*T( ILAST, ILAST ) ) | |||
| ABI22 = AD22 - U12*AD21 | |||
| ABI12 = AD12 - U12*AD11 | |||
| * | |||
| T1 = HALF*( AD11+ABI22 ) | |||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||
| TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + | |||
| $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) | |||
| IF( TEMP.LE.ZERO ) THEN | |||
| SHIFT = T1 + RTDISC | |||
| ELSE | |||
| SHIFT = T1 - RTDISC | |||
| SHIFT = ABI22 | |||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||
| TEMP = ABS1( CTEMP ) | |||
| IF( CTEMP.NE.ZERO ) THEN | |||
| X = HALF*( AD11-SHIFT ) | |||
| TEMP2 = ABS1( X ) | |||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||
| IF( TEMP2.GT.ZERO ) THEN | |||
| IF( REAL( X / TEMP2 )*REAL( Y )+ | |||
| $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y | |||
| END IF | |||
| SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) | |||
| END IF | |||
| ELSE | |||
| * | |||
| * Exceptional shift. Chosen for no particularly good reason. | |||
| * | |||
| ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ | |||
| $ (BSCALE*T(ILAST-1,ILAST-1)) | |||
| IF( ( IITER / 20 )*20.EQ.IITER .AND. | |||
| $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN | |||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||
| $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) | |||
| ELSE | |||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||
| $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) | |||
| END IF | |||
| SHIFT = ESHIFT | |||
| END IF | |||
| * | |||
| @@ -320,12 +320,13 @@ | |||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | |||
| COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | |||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | |||
| $ U12, X | |||
| $ U12, X, ABI12, Y | |||
| * .. | |||
| * .. External Functions .. | |||
| COMPLEX*16 ZLADIV | |||
| LOGICAL LSAME | |||
| DOUBLE PRECISION DLAMCH, ZLANHS | |||
| EXTERNAL LSAME, DLAMCH, ZLANHS | |||
| EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL | |||
| @@ -730,22 +731,34 @@ | |||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | |||
| $ ( BSCALE*T( ILAST, ILAST ) ) | |||
| ABI22 = AD22 - U12*AD21 | |||
| ABI12 = AD12 - U12*AD11 | |||
| * | |||
| T1 = HALF*( AD11+ABI22 ) | |||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||
| TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + | |||
| $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) | |||
| IF( TEMP.LE.ZERO ) THEN | |||
| SHIFT = T1 + RTDISC | |||
| ELSE | |||
| SHIFT = T1 - RTDISC | |||
| SHIFT = ABI22 | |||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||
| TEMP = ABS1( CTEMP ) | |||
| IF( CTEMP.NE.ZERO ) THEN | |||
| X = HALF*( AD11-SHIFT ) | |||
| TEMP2 = ABS1( X ) | |||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||
| IF( TEMP2.GT.ZERO ) THEN | |||
| IF( DBLE( X / TEMP2 )*DBLE( Y )+ | |||
| $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y | |||
| END IF | |||
| SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) | |||
| END IF | |||
| ELSE | |||
| * | |||
| * Exceptional shift. Chosen for no particularly good reason. | |||
| * | |||
| ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ | |||
| $ (BSCALE*T(ILAST-1,ILAST-1)) | |||
| IF( ( IITER / 20 )*20.EQ.IITER .AND. | |||
| $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN | |||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||
| $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) | |||
| ELSE | |||
| ESHIFT = ESHIFT + ( ASCALE*H( ILAST, | |||
| $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) | |||
| END IF | |||
| SHIFT = ESHIFT | |||
| END IF | |||
| * | |||
| @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) | |||
| endif() | |||
| if(WIN32) | |||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 | |||
| "if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" | |||
| "$ErrorActionPreference = \"Stop\"\n" | |||
| "Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" | |||
| "If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" | |||
| "echo Error\n" | |||
| "exit 1\n" | |||
| "} else {\n" | |||
| "exit 0\n" | |||
| "}\n" | |||
| ) | |||
| set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") | |||
| else() | |||
| # $1 exec, $2 input, $3 output_result | |||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh | |||
| "rm -f $3\n" | |||
| @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh | |||
| "exit 0\n" | |||
| "fi\n" | |||
| ) | |||
| set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") | |||
| endif() | |||
| # LAPACK linear-equation (LIN) tests. Each call passes the helper script three arguments: test executable, input file, output file. | |||
| # Output paths must not start with a blank inside the quotes: a quoted argument is passed verbatim, so the helper would get a path beginning with a space. | |||
| add_test(NAME "REAL_LAPACK_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" | |||
| ) | |||
| add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" | |||
| ) | |||
| add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" | |||
| ) | |||
| add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" | |||
| ) | |||
| add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" "${CMAKE_CURRENT_BINARY_DIR}/dstest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" "${CMAKE_CURRENT_BINARY_DIR}/dstest.out" | |||
| ) | |||
| # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== | |||
| add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" "${CMAKE_CURRENT_BINARY_DIR}/zctest.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" "${CMAKE_CURRENT_BINARY_DIR}/zctest.out" | |||
| ) | |||
| # ======== SINGLE RFP LIN TESTS ======================== | |||
| add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" | |||
| ) | |||
| # ======== DOUBLE PRECISION RFP LIN TESTS ======================== | |||
| add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" | |||
| ) | |||
| # ======== COMPLEX RFP LIN TESTS ======================== | |||
| add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" | |||
| ) | |||
| # ======== COMPLEX16 RFP LIN TESTS ======================== | |||
| add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" | |||
| ) | |||
| # | |||
| # | |||
| @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" | |||
| # | |||
| # Single-precision eigenvalue (EIG) tests, all driven by EIG/xeigtsts. Helper arguments: test executable, input file, output file. | |||
| # Output paths carry no leading blank inside the quotes, so the helper receives a clean absolute path. | |||
| add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" "${CMAKE_CURRENT_BINARY_DIR}/snep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" "${CMAKE_CURRENT_BINARY_DIR}/snep.out" | |||
| ) | |||
| add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/ssep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/ssep.out" | |||
| ) | |||
| add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/sse2.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/sse2.out" | |||
| ) | |||
| add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" | |||
| ) | |||
| add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" "${CMAKE_CURRENT_BINARY_DIR}/sec.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" "${CMAKE_CURRENT_BINARY_DIR}/sec.out" | |||
| ) | |||
| add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" "${CMAKE_CURRENT_BINARY_DIR}/sed.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" "${CMAKE_CURRENT_BINARY_DIR}/sed.out" | |||
| ) | |||
| add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" "${CMAKE_CURRENT_BINARY_DIR}/sgg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" "${CMAKE_CURRENT_BINARY_DIR}/sgg.out" | |||
| ) | |||
| add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" "${CMAKE_CURRENT_BINARY_DIR}/sgd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" "${CMAKE_CURRENT_BINARY_DIR}/sgd.out" | |||
| ) | |||
| add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" "${CMAKE_CURRENT_BINARY_DIR}/ssb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" "${CMAKE_CURRENT_BINARY_DIR}/ssb.out" | |||
| ) | |||
| add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" "${CMAKE_CURRENT_BINARY_DIR}/ssg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" "${CMAKE_CURRENT_BINARY_DIR}/ssg.out" | |||
| ) | |||
| add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sbal.out" | |||
| ) | |||
| add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sbak.out" | |||
| ) | |||
| add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" | |||
| ) | |||
| add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" | |||
| ) | |||
| add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" "${CMAKE_CURRENT_BINARY_DIR}/sbb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" "${CMAKE_CURRENT_BINARY_DIR}/sbb.out" | |||
| ) | |||
| add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" "${CMAKE_CURRENT_BINARY_DIR}/sglm.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" "${CMAKE_CURRENT_BINARY_DIR}/sglm.out" | |||
| ) | |||
| add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" "${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" "${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" | |||
| ) | |||
| add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" | |||
| ) | |||
| add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" "${CMAKE_CURRENT_BINARY_DIR}/scsd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" "${CMAKE_CURRENT_BINARY_DIR}/scsd.out" | |||
| ) | |||
| add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" "${CMAKE_CURRENT_BINARY_DIR}/slse.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" "${CMAKE_CURRENT_BINARY_DIR}/slse.out" | |||
| ) | |||
| # ======== COMPLEX EIG TESTS =========================== | |||
| # All driven by EIG/xeigtstc. Helper arguments: test executable, input file, output file. | |||
| # Output paths carry no leading blank inside the quotes, so the helper receives a clean absolute path. | |||
| add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" "${CMAKE_CURRENT_BINARY_DIR}/cnep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" "${CMAKE_CURRENT_BINARY_DIR}/cnep.out" | |||
| ) | |||
| add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/csep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/csep.out" | |||
| ) | |||
| add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/cse2.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/cse2.out" | |||
| ) | |||
| add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/csvd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/csvd.out" | |||
| ) | |||
| add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" "${CMAKE_CURRENT_BINARY_DIR}/cec.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" "${CMAKE_CURRENT_BINARY_DIR}/cec.out" | |||
| ) | |||
| add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" "${CMAKE_CURRENT_BINARY_DIR}/ced.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" "${CMAKE_CURRENT_BINARY_DIR}/ced.out" | |||
| ) | |||
| add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" "${CMAKE_CURRENT_BINARY_DIR}/cgg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" "${CMAKE_CURRENT_BINARY_DIR}/cgg.out" | |||
| ) | |||
| add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" "${CMAKE_CURRENT_BINARY_DIR}/cgd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" "${CMAKE_CURRENT_BINARY_DIR}/cgd.out" | |||
| ) | |||
| add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" "${CMAKE_CURRENT_BINARY_DIR}/csb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" "${CMAKE_CURRENT_BINARY_DIR}/csb.out" | |||
| ) | |||
| add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" "${CMAKE_CURRENT_BINARY_DIR}/csg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" "${CMAKE_CURRENT_BINARY_DIR}/csg.out" | |||
| ) | |||
| add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" "${CMAKE_CURRENT_BINARY_DIR}/cbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" "${CMAKE_CURRENT_BINARY_DIR}/cbal.out" | |||
| ) | |||
| add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" "${CMAKE_CURRENT_BINARY_DIR}/cbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" "${CMAKE_CURRENT_BINARY_DIR}/cbak.out" | |||
| ) | |||
| add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" | |||
| ) | |||
| add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" | |||
| ) | |||
| add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" | |||
| ) | |||
| add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" | |||
| ) | |||
| add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" | |||
| ) | |||
| add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" | |||
| ) | |||
| add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" | |||
| ) | |||
| add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" | |||
| ) | |||
| # ======== DOUBLE EIG TESTS =========================== | |||
| add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" | |||
| ) | |||
| add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" | |||
| ) | |||
| add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" | |||
| ) | |||
| add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" | |||
| ) | |||
| add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" | |||
| ) | |||
| add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" | |||
| ) | |||
| add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" | |||
| ) | |||
| add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" | |||
| ) | |||
| add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" | |||
| ) | |||
| add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" | |||
| ) | |||
| add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" | |||
| ) | |||
| add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" | |||
| ) | |||
| add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" | |||
| ) | |||
| add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" | |||
| ) | |||
| add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" | |||
| ) | |||
| add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" | |||
| ) | |||
| add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" | |||
| ) | |||
| add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" | |||
| ) | |||
| add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" | |||
| ) | |||
| add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" | |||
| ) | |||
| # ======== COMPLEX16 EIG TESTS =========================== | |||
| add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" | |||
| ) | |||
| add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" | |||
| ) | |||
| add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" | |||
| ) | |||
| add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" | |||
| ) | |||
| add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" | |||
| ) | |||
| add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" | |||
| ) | |||
| add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" | |||
| ) | |||
| add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" | |||
| ) | |||
| add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" | |||
| ) | |||
| add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" | |||
| ) | |||
| add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" | |||
| ) | |||
| add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" | |||
| ) | |||
| # ZGGBAL runs the xeigtstz driver on zgbal.in; the name says COMPLEX16 like the other xeigtstz balancing tests. | |||
| add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX16_general_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" | |||
| ) | |||
| add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" | |||
| ) | |||
| add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" | |||
| ) | |||
| add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" | |||
| ) | |||
| add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" | |||
| ) | |||
| add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" | |||
| ) | |||
| add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" | |||
| ) | |||
| # ZLSE runs the xeigtstz driver on lse.in; named with the same <PREFIX>:_Testing_ pattern as the CLSE and DLSE tests. | |||
| add_test(NAME "ZLSE:_Testing_Constrained_Linear_Least_Squares_routines" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" | |||
| COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" | |||
| ) | |||
| @@ -25,7 +25,7 @@ set(AEIGTST | |||
| set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f | |||
| ssvdch.f ssvdct.f ssxt1.f) | |||
| set(SEIGTST schkee.f | |||
| set(SEIGTST schkee.F | |||
| sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f | |||
| schkbb.f schkbd.f schkbk.f schkbl.f schkec.f | |||
| schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f | |||
| @@ -42,7 +42,7 @@ set(SEIGTST schkee.f | |||
| sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f | |||
| sstt22.f ssyt21.f ssyt22.f) | |||
| set(CEIGTST cchkee.f | |||
| set(CEIGTST cchkee.F | |||
| cbdt01.f cbdt02.f cbdt03.f cbdt05.f | |||
| cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f | |||
| cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f | |||
| @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f | |||
| set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f | |||
| dsvdch.f dsvdct.f dsxt1.f) | |||
| set(DEIGTST dchkee.f | |||
| set(DEIGTST dchkee.F | |||
| dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f | |||
| dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f | |||
| dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f | |||
| @@ -79,7 +79,7 @@ set(DEIGTST dchkee.f | |||
| dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f | |||
| dstt22.f dsyt21.f dsyt22.f) | |||
| set(ZEIGTST zchkee.f | |||
| set(ZEIGTST zchkee.F | |||
| zbdt01.f zbdt02.f zbdt03.f zbdt05.f | |||
| zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f | |||
| zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f | |||
| @@ -157,11 +157,11 @@ cleanobj: | |||
| cleanexe: | |||
| rm -f xeigtst* | |||
| schkee.o: schkee.f | |||
| schkee.o: schkee.F | |||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | |||
| dchkee.o: dchkee.f | |||
| dchkee.o: dchkee.F | |||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | |||
| cchkee.o: cchkee.f | |||
| cchkee.o: cchkee.F | |||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | |||
| zchkee.o: zchkee.f | |||
| zchkee.o: zchkee.F | |||
| $(FC) $(FFLAGS_DRV) -c -o $@ $< | |||
| @@ -1034,6 +1034,10 @@ | |||
| * ===================================================================== | |||
| PROGRAM CCHKEE | |||
| * | |||
| #if defined(_OPENMP) | |||
| use omp_lib | |||
| #endif | |||
| * | |||
| * -- LAPACK test routine (version 3.7.0) -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| @@ -1071,7 +1075,7 @@ | |||
| CHARACTER*80 LINE | |||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | |||
| $ NK, NN, NPARMS, NRHS, NTYPES, | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||
| REAL EPS, S1, S2, THRESH, THRSHN | |||
| * .. | |||
| * .. Local Arrays .. | |||
| @@ -1084,12 +1088,16 @@ | |||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | |||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | |||
| REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | |||
| $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) | |||
| COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), | |||
| $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), | |||
| $ RESULT( 500 ) | |||
| COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), | |||
| $ X( 5*NMAX ) | |||
| * .. | |||
| * .. Allocatable Arrays .. | |||
| INTEGER AllocateStatus | |||
| REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S | |||
| COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK | |||
| COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAMEN | |||
| REAL SECOND, SLAMCH | |||
| @@ -1130,6 +1138,21 @@ | |||
| DATA INTSTR / '0123456789' / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| * .. | |||
| * .. Allocate memory dynamically .. | |||
| * | |||
| ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| A = 0.0 | |||
| @@ -1846,8 +1869,16 @@ | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| CALL XLAENV( 1, 1 ) | |||
| CALL XLAENV( 9, 25 ) | |||
| IF( TSTERR ) | |||
| $ CALL CERRST( 'CST', NOUT ) | |||
| * Call CERRST with the OpenMP thread count temporarily forced to 1, | |||
| * then put the previous thread count back. | |||
| IF( TSTERR ) THEN | |||
| #if defined(_OPENMP) | |||
| * OMP_GET_NUM_THREADS returns 1 outside a parallel region, so the | |||
| * count to restore is read with OMP_GET_MAX_THREADS instead. | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL CERRST( 'CST', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| DO 290 I = 1, NPARMS | |||
| CALL XLAENV( 1, NBVAL( I ) ) | |||
| CALL XLAENV( 2, NBMIN( I ) ) | |||
| @@ -2305,8 +2336,16 @@ | |||
| MAXTYP = 15 | |||
| NTYPES = MIN( MAXTYP, NTYPES ) | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| IF( TSTERR ) | |||
| $ CALL CERRST( 'CHB', NOUT ) | |||
| * Call CERRST with the OpenMP thread count temporarily forced to 1, | |||
| * then put the previous thread count back. | |||
| IF( TSTERR ) THEN | |||
| #if defined(_OPENMP) | |||
| * OMP_GET_NUM_THREADS returns 1 outside a parallel region, so the | |||
| * count to restore is read with OMP_GET_MAX_THREADS instead. | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL CERRST( 'CHB', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | |||
| * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | |||
| * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | |||
| @@ -2436,7 +2475,14 @@ | |||
| 380 CONTINUE | |||
| WRITE( NOUT, FMT = 9994 ) | |||
| S2 = SECOND( ) | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| * | |||
| DEALLOCATE (S, STAT = AllocateStatus) | |||
| DEALLOCATE (A, STAT = AllocateStatus) | |||
| DEALLOCATE (B, STAT = AllocateStatus) | |||
| DEALLOCATE (C, STAT = AllocateStatus) | |||
| DEALLOCATE (RWORK, STAT = AllocateStatus) | |||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||
| * | |||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | |||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | |||
| @@ -1038,7 +1038,11 @@ | |||
| *> \ingroup double_eig | |||
| * | |||
| * ===================================================================== | |||
| PROGRAM DCHKEE | |||
| PROGRAM DCHKEE | |||
| * | |||
| #if defined(_OPENMP) | |||
| use omp_lib | |||
| #endif | |||
| * | |||
| * -- LAPACK test routine (version 3.7.0) -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| @@ -1077,7 +1081,7 @@ | |||
| CHARACTER*80 LINE | |||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | |||
| $ NK, NN, NPARMS, NRHS, NTYPES, | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||
| DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | |||
| * .. | |||
| * .. Local Arrays .. | |||
| @@ -1089,10 +1093,13 @@ | |||
| $ PVAL( MAXIN ) | |||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | |||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | |||
| DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), | |||
| $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), | |||
| $ WORK( LWORK ), X( 5*NMAX ) | |||
| DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), | |||
| $ TAUB( NMAX ), X( 5*NMAX ) | |||
| * .. | |||
| * .. Allocatable Arrays .. | |||
| INTEGER AllocateStatus | |||
| DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK | |||
| DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAMEN | |||
| @@ -1132,7 +1139,18 @@ | |||
| * .. | |||
| * .. Data statements .. | |||
| DATA INTSTR / '0123456789' / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| * .. | |||
| * .. Allocate memory dynamically .. | |||
| * | |||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -1856,8 +1874,16 @@ | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| CALL XLAENV( 1, 1 ) | |||
| CALL XLAENV( 9, 25 ) | |||
| IF( TSTERR ) | |||
| $ CALL DERRST( 'DST', NOUT ) | |||
| IF( TSTERR ) THEN | |||
| * | |||
| * Call DERRST with the OpenMP thread count forced to 1, then put | |||
| * the previous setting back. OMP_GET_MAX_THREADS is used to save | |||
| * the setting because OMP_GET_NUM_THREADS returns 1 when called | |||
| * outside a parallel region, which would make the restore below | |||
| * leave the remaining tests limited to a single thread. | |||
| * | |||
| #if defined(_OPENMP) | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL DERRST( 'DST', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| DO 290 I = 1, NPARMS | |||
| CALL XLAENV( 1, NBVAL( I ) ) | |||
| CALL XLAENV( 2, NBMIN( I ) ) | |||
| @@ -2436,7 +2462,12 @@ | |||
| 380 CONTINUE | |||
| WRITE( NOUT, FMT = 9994 ) | |||
| S2 = DSECND( ) | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| * | |||
| DEALLOCATE (A, STAT = AllocateStatus) | |||
| DEALLOCATE (B, STAT = AllocateStatus) | |||
| DEALLOCATE (C, STAT = AllocateStatus) | |||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||
| * | |||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | |||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | |||
| @@ -1040,6 +1040,10 @@ | |||
| * ===================================================================== | |||
| PROGRAM SCHKEE | |||
| * | |||
| #if defined(_OPENMP) | |||
| use omp_lib | |||
| #endif | |||
| * | |||
| * -- LAPACK test routine (version 3.7.0) -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| @@ -1077,7 +1081,7 @@ | |||
| CHARACTER*80 LINE | |||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | |||
| $ NK, NN, NPARMS, NRHS, NTYPES, | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||
| REAL EPS, S1, S2, THRESH, THRSHN | |||
| * .. | |||
| * .. Local Arrays .. | |||
| @@ -1089,10 +1093,13 @@ | |||
| $ PVAL( MAXIN ) | |||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | |||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | |||
| REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), | |||
| $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), | |||
| $ WORK( LWORK ), X( 5*NMAX ) | |||
| REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), | |||
| $ TAUB( NMAX ), X( 5*NMAX ) | |||
| * .. | |||
| * .. Allocatable Arrays .. | |||
| INTEGER AllocateStatus | |||
| REAL, DIMENSION(:), ALLOCATABLE :: WORK | |||
| REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAMEN | |||
| @@ -1132,7 +1139,18 @@ | |||
| * .. | |||
| * .. Data statements .. | |||
| DATA INTSTR / '0123456789' / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| * .. | |||
| * .. Allocate memory dynamically .. | |||
| * | |||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -1857,8 +1875,16 @@ | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| CALL XLAENV( 1, 1 ) | |||
| CALL XLAENV( 9, 25 ) | |||
| IF( TSTERR ) | |||
| $ CALL SERRST( 'SST', NOUT ) | |||
| IF( TSTERR ) THEN | |||
| * | |||
| * Call SERRST with the OpenMP thread count forced to 1, then put | |||
| * the previous setting back. OMP_GET_MAX_THREADS is used to save | |||
| * the setting because OMP_GET_NUM_THREADS returns 1 when called | |||
| * outside a parallel region, which would make the restore below | |||
| * leave the remaining tests limited to a single thread. | |||
| * | |||
| #if defined(_OPENMP) | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL SERRST( 'SST', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| DO 290 I = 1, NPARMS | |||
| CALL XLAENV( 1, NBVAL( I ) ) | |||
| CALL XLAENV( 2, NBMIN( I ) ) | |||
| @@ -2440,6 +2466,11 @@ | |||
| WRITE( NOUT, FMT = 9994 ) | |||
| S2 = SECOND( ) | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| * | |||
| DEALLOCATE (A, STAT = AllocateStatus) | |||
| DEALLOCATE (B, STAT = AllocateStatus) | |||
| DEALLOCATE (C, STAT = AllocateStatus) | |||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||
| * | |||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | |||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | |||
| @@ -1034,6 +1034,10 @@ | |||
| * ===================================================================== | |||
| PROGRAM ZCHKEE | |||
| * | |||
| #if defined(_OPENMP) | |||
| use omp_lib | |||
| #endif | |||
| * | |||
| * -- LAPACK test routine (version 3.7.0) -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| @@ -1071,7 +1075,7 @@ | |||
| CHARACTER*80 LINE | |||
| INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, | |||
| $ NK, NN, NPARMS, NRHS, NTYPES, | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH | |||
| $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS | |||
| DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN | |||
| * .. | |||
| * .. Local Arrays .. | |||
| @@ -1084,12 +1088,16 @@ | |||
| INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), | |||
| $ ISHFTS( MAXIN ), IACC22( MAXIN ) | |||
| DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), | |||
| $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) | |||
| COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), | |||
| $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), | |||
| $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), | |||
| $ RESULT( 500 ) | |||
| COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), | |||
| $ X( 5*NMAX ) | |||
| * .. | |||
| * .. Allocatable Arrays .. | |||
| INTEGER AllocateStatus | |||
| DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S | |||
| COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK | |||
| COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAMEN | |||
| DOUBLE PRECISION DLAMCH, DSECND | |||
| @@ -1130,6 +1138,21 @@ | |||
| DATA INTSTR / '0123456789' / | |||
| DATA IOLDSD / 0, 0, 0, 1 / | |||
| * .. | |||
| * .. Allocate memory dynamically .. | |||
| * | |||
| ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) | |||
| IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| A = 0.0 | |||
| @@ -1846,8 +1869,16 @@ | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| CALL XLAENV( 1, 1 ) | |||
| CALL XLAENV( 9, 25 ) | |||
| IF( TSTERR ) | |||
| $ CALL ZERRST( 'ZST', NOUT ) | |||
| IF( TSTERR ) THEN | |||
| * | |||
| * Call ZERRST with the OpenMP thread count forced to 1, then put | |||
| * the previous setting back. OMP_GET_MAX_THREADS is used to save | |||
| * the setting because OMP_GET_NUM_THREADS returns 1 when called | |||
| * outside a parallel region, which would make the restore below | |||
| * leave the remaining tests limited to a single thread. | |||
| * | |||
| #if defined(_OPENMP) | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL ZERRST( 'ZST', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| DO 290 I = 1, NPARMS | |||
| CALL XLAENV( 1, NBVAL( I ) ) | |||
| CALL XLAENV( 2, NBMIN( I ) ) | |||
| @@ -2303,8 +2334,16 @@ | |||
| MAXTYP = 15 | |||
| NTYPES = MIN( MAXTYP, NTYPES ) | |||
| CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) | |||
| IF( TSTERR ) | |||
| $ CALL ZERRST( 'ZHB', NOUT ) | |||
| IF( TSTERR ) THEN | |||
| * | |||
| * Call ZERRST with the OpenMP thread count forced to 1, then put | |||
| * the previous setting back. OMP_GET_MAX_THREADS is used to save | |||
| * the setting because OMP_GET_NUM_THREADS returns 1 when called | |||
| * outside a parallel region, which would make the restore below | |||
| * leave the remaining tests limited to a single thread. | |||
| * | |||
| #if defined(_OPENMP) | |||
| N_THREADS = OMP_GET_MAX_THREADS() | |||
| CALL OMP_SET_NUM_THREADS(1) | |||
| #endif | |||
| CALL ZERRST( 'ZHB', NOUT ) | |||
| #if defined(_OPENMP) | |||
| CALL OMP_SET_NUM_THREADS(N_THREADS) | |||
| #endif | |||
| END IF | |||
| * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, | |||
| * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), | |||
| * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, | |||
| @@ -2435,6 +2474,13 @@ | |||
| WRITE( NOUT, FMT = 9994 ) | |||
| S2 = DSECND( ) | |||
| WRITE( NOUT, FMT = 9993 )S2 - S1 | |||
| * | |||
| DEALLOCATE (S, STAT = AllocateStatus) | |||
| DEALLOCATE (A, STAT = AllocateStatus) | |||
| DEALLOCATE (B, STAT = AllocateStatus) | |||
| DEALLOCATE (C, STAT = AllocateStatus) | |||
| DEALLOCATE (RWORK, STAT = AllocateStatus) | |||
| DEALLOCATE (WORK, STAT = AllocateStatus) | |||
| * | |||
| 9999 FORMAT( / ' Execution not attempted due to input errors' ) | |||
| 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) | |||
| @@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef PARAM_H | |||
| #define PARAM_H | |||
| /* LONGCAST is placed in front of some GEMM_DEFAULT_ALIGN constants below. */ | |||
| /* By default it expands to a (BLASLONG) cast. */ | |||
| #define LONGCAST (BLASLONG) | |||
| /* When __BYTE_ORDER__ is defined and __GNUC__ compares below 9, LONGCAST */ | |||
| /* is redefined to expand to nothing, so those constants stay uncast. */ | |||
| /* An undefined __GNUC__ is treated as 0 inside #if, so it also takes */ | |||
| /* this branch. */ | |||
| /* NOTE(review): the reason the cast is dropped is not visible here. */ | |||
| #if defined(__BYTE_ORDER__) | |||
| #if __GNUC__ < 9 | |||
| #undef LONGCAST | |||
| #define LONGCAST | |||
| #endif | |||
| #endif | |||
| #define SBGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SBGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SBGEMM_DEFAULT_UNROLL_MN 32 | |||
| @@ -85,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 256 | |||
| #define GEMM_DEFAULT_ALIGN 0x01ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -157,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -237,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||
| @@ -330,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||
| @@ -422,7 +430,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||
| @@ -515,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL | |||
| @@ -607,7 +615,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -726,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 384 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -774,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 256 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -821,7 +829,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 256 | |||
| #define GEMM_DEFAULT_ALIGN 0x01ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL | |||
| #ifdef ARCH_X86 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -890,7 +898,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #ifdef HAVE_SSE | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| @@ -945,7 +953,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #ifdef CORE_YONAH | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| @@ -1011,7 +1019,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 32 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SYMV_P 8 | |||
| @@ -1068,7 +1076,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 256 | |||
| #endif | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SYMV_P 8 | |||
| @@ -1128,7 +1136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 448 | |||
| #define GEMM_DEFAULT_OFFSET_B 128 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1201,7 +1209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 128 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1272,7 +1280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 128 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1344,7 +1352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 32 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1417,7 +1425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1510,7 +1518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1636,7 +1644,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SYMV_P 8 | |||
| @@ -1877,7 +1885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SYMV_P 8 | |||
| @@ -1939,7 +1947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 128 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| @@ -1993,7 +2001,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 512 | |||
| #define GEMM_DEFAULT_OFFSET_B 512 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2061,7 +2069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 8192 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef PPCG4 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 2688 | |||
| #define GEMM_DEFAULT_OFFSET_B 3072 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL | |||
| #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| @@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A (32 * 0) | |||
| #define GEMM_DEFAULT_OFFSET_B (32 * 0) | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A (32 * 0) | |||
| #define GEMM_DEFAULT_OFFSET_B (32 * 0) | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER3) || defined(POWER4) || defined(POWER5) | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 2048 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 384 | |||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2344,7 +2352,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #if defined(__32BIT__) | |||
| #warning using BINARY32==POWER6 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| @@ -2397,7 +2406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SWITCH_RATIO 16 | |||
| #define GEMM_PREFERED_SIZE 16 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| @@ -2433,24 +2445,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL | |||
| #define SWITCH_RATIO 16 | |||
| #define GEMM_PREFERED_SIZE 16 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #else | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #endif | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 832 | |||
| #define DGEMM_DEFAULT_P 320 | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_P 384 | |||
| #define CGEMM_DEFAULT_P 512 | |||
| #define ZGEMM_DEFAULT_P 256 | |||
| #define SGEMM_DEFAULT_Q 1026 | |||
| #define DGEMM_DEFAULT_Q 960 | |||
| #define SGEMM_DEFAULT_Q 512 | |||
| #define DGEMM_DEFAULT_Q 512 | |||
| #define CGEMM_DEFAULT_Q 1026 | |||
| #define ZGEMM_DEFAULT_Q 1026 | |||
| @@ -2480,7 +2500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 2048 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| @@ -2512,7 +2532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 2048 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2543,7 +2563,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| @@ -2578,7 +2598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #ifdef HAVE_MSA | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| @@ -2634,7 +2654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2675,7 +2695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL | |||
| #ifdef HAVE_MSA | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| @@ -2724,7 +2744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef RISCV64_GENERIC | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| @@ -2805,7 +2825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2846,7 +2866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| @@ -3121,7 +3141,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| @@ -3162,7 +3182,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -3203,7 +3223,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -3244,7 +3264,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| @@ -3283,7 +3303,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -3365,7 +3385,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||