| @@ -6,7 +6,7 @@ | |||
| INCLUDED = 1 | |||
| ifndef TOPDIR | |||
| TOPDIR = . | |||
| TOPDIR = . | |||
| endif | |||
| # If ARCH is not set, we use the host system's architecture for getarch compile options. | |||
| @@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| else | |||
| HAVE_NEON= | |||
| HAVE_VFP= | |||
| HAVE_VFPV3= | |||
| HAVE_VFPV4= | |||
| HAVE_MMX= | |||
| HAVE_SSE= | |||
| HAVE_SSE2= | |||
| HAVE_SSE3= | |||
| HAVE_SSSE3= | |||
| HAVE_SSE4_1= | |||
| HAVE_SSE4_2= | |||
| HAVE_SSE4A= | |||
| HAVE_SSE5= | |||
| HAVE_AVX= | |||
| HAVE_AVX2= | |||
| HAVE_FMA3= | |||
| include $(TOPDIR)/Makefile_kernel.conf | |||
| endif | |||
| @@ -1522,6 +1538,8 @@ export HAVE_SSE4_2 | |||
| export HAVE_SSE4A | |||
| export HAVE_SSE5 | |||
| export HAVE_AVX | |||
| export HAVE_AVX2 | |||
| export HAVE_FMA3 | |||
| export HAVE_VFP | |||
| export HAVE_VFPV3 | |||
| export HAVE_VFPV4 | |||
| @@ -9,9 +9,9 @@ endif | |||
| endif | |||
| ifdef HAVE_SSE3 | |||
| ifndef DYNAMIC_ARCH | |||
| CCOMMON_OPT += -msse3 | |||
| FCOMMON_OPT += -msse3 | |||
| endif | |||
| ifdef HAVE_SSSE3 | |||
| CCOMMON_OPT += -mssse3 | |||
| FCOMMON_OPT += -mssse3 | |||
| @@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1 | |||
| CCOMMON_OPT += -msse4.1 | |||
| FCOMMON_OPT += -msse4.1 | |||
| endif | |||
| ifdef HAVE_AVX | |||
| CCOMMON_OPT += -mavx | |||
| FCOMMON_OPT += -mavx | |||
| endif | |||
| ifdef HAVE_AVX2 | |||
| CCOMMON_OPT += -mavx2 | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| ifdef HAVE_FMA3 | |||
| CCOMMON_OPT += -mfma | |||
| FCOMMON_OPT += -mfma | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| @@ -66,8 +76,7 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) | |||
| ifndef DYNAMIC_ARCH | |||
| ifdef HAVE_AVX2 | |||
| ifndef NO_AVX2 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| @@ -96,7 +105,6 @@ endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL "SKYLAKEX") | |||
| if (${CORE} STREQUAL SKYLAKEX) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") | |||
| @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL "COOPERLAKE") | |||
| if (${CORE} STREQUAL COOPERLAKE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| @@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(CGEMM3M_UNROLL_N 4) | |||
| set(ZGEMM3M_UNROLL_M 4) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "BARCELONA") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_SSE3\n") | |||
| elseif ("${TCORE}" STREQUAL "STEAMROLLER") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_SSE3\n") | |||
| elseif ("${TCORE}" STREQUAL "EXCAVATOR") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_SSE3\n") | |||
| elseif ("${TCORE}" STREQUAL "NEHALEM") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_SSE3\n") | |||
| elseif ("${TCORE}" STREQUAL "PRESCOTT") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_SSE3\n") | |||
| elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_AVX\n") | |||
| elseif ("${TCORE}" STREQUAL "HASWELL") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_AVX2\n") | |||
| elseif ("${TCORE}" STREQUAL "ZEN") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_AVX2\n") | |||
| elseif ("${TCORE}" STREQUAL "SKYLAKEX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_AVX512\n") | |||
| elseif ("${TCORE}" STREQUAL "COOPERLAKE") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define HAVE_AVX512\n") | |||
| elseif ("${TCORE}" STREQUAL "ARMV7") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| @@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| endif () | |||
| endif () | |||
| unset (HAVE_AVX2) | |||
| unset (HAVE_AVX) | |||
| unset (HAVE_FMA3) | |||
| unset (HAVE_MMX) | |||
| unset (HAVE_SSE) | |||
| unset (HAVE_SSE2) | |||
| unset (HAVE_SSE3) | |||
| unset (HAVE_SSSE3) | |||
| unset (HAVE_SSE4A) | |||
| unset (HAVE_SSE4_1) | |||
| unset (HAVE_SSE4_2) | |||
| unset (HAVE_NEON) | |||
| unset (HAVE_VFP) | |||
| unset (HAVE_VFPV3) | |||
| unset (HAVE_VFPV4) | |||
| message(STATUS "Running getarch") | |||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | |||
| @@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| endif () | |||
| endif () | |||
| if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) | |||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| # endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") | |||
| endif() | |||
| if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") | |||
| endif() | |||
| if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") | |||
| endif() | |||
| if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (DEFINED HAVE_SSE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") | |||
| endif() | |||
| if (DEFINED HAVE_SSE2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") | |||
| endif() | |||
| if (DEFINED HAVE_SSE3) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (DEFINED HAVE_SSSE3) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") | |||
| endif() | |||
| if (DEFINED HAVE_SSE4_1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") | |||
| endif() | |||
| endif() | |||
| if (DEFINED TARGET) | |||
| message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") | |||
| message(STATUS "Targeting the ${TARGET} architecture.") | |||
| set(GETARCH_FLAGS "-DFORCE_${TARGET}") | |||
| endif () | |||
| @@ -211,6 +146,63 @@ else() | |||
| endif () | |||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | |||
| if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | |||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| # endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_AVX) | |||
| if (NOT NO_AVX) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_AVX2) | |||
| if (NOT NO_AVX2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_FMA3) | |||
| if (NOT NO_AVX2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_SSE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") | |||
| endif() | |||
| if (DEFINED HAVE_SSE2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") | |||
| endif() | |||
| if (DEFINED HAVE_SSE3) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| if (DEFINED HAVE_SSSE3) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") | |||
| endif() | |||
| if (DEFINED HAVE_SSE4_1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") | |||
| endif() | |||
| endif() | |||
| if (DEFINED BINARY) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| endif () | |||
| @@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha | |||
| break; | |||
| } | |||
| mode |= BLAS_LEGACY; | |||
| if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY; | |||
| for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); | |||
| @@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| routine = queue -> routine; | |||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||
| if (queue -> mode & BLAS_LEGACY) { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } else | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
| queue -> sa, queue -> sb, 0); | |||
| } else { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } | |||
| if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); | |||
| @@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #else | |||
| @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" | |||
| #define LIBNAME "skylakex" | |||
| #define CORENAME "SKYLAKEX" | |||
| #endif | |||
| @@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #else | |||
| @@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" | |||
| #define LIBNAME "cooperlake" | |||
| #define CORENAME "COOPERLAKE" | |||
| #endif | |||
| @@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||
| "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" | |||
| "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "zen" | |||
| #define CORENAME "ZEN" | |||
| #endif | |||
| @@ -5,13 +5,6 @@ endif | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| ifdef HAVE_SSE3 | |||
| CFLAGS += -msse3 | |||
| endif | |||
| ifdef HAVE_SSSE3 | |||
| CFLAGS += -mssse3 | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| override CFLAGS += -fno-integrated-as | |||
| @@ -38,12 +31,6 @@ ifdef NO_AVX2 | |||
| endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) | |||
| override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 | |||
| endif | |||
| ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) | |||
| override CFLAGS += -msse -msse2 | |||
| endif | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| @@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c | |||
| CCOPYKERNEL = ccopy_power10.c | |||
| ZCOPYKERNEL = zcopy_power10.c | |||
| # | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| SDOTKERNEL = sdot_power10.c | |||
| DDOTKERNEL = ddot_power10.c | |||
| DSDOTKERNEL = sdot_power10.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static double ddot_kernel_8 (long n, double *x, double *y) | |||
| { | |||
| double dot; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| "dcbt 0, %3 \n\t" | |||
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" | |||
| "xvmaddadp 33, 41, 49 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "xvmaddadp 34, 42, 50 \n\t" | |||
| "xvmaddadp 35, 43, 51 \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "xvmaddadp 36, 44, 52 \n\t" | |||
| "xvmaddadp 37, 45, 53 \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "xvmaddadp 38, 46, 54 \n\t" | |||
| "xvmaddadp 39, 47, 55 \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" | |||
| "xvmaddadp 33, 41, 49 \n\t" | |||
| "xvmaddadp 34, 42, 50 \n\t" | |||
| "xvmaddadp 35, 43, 51 \n\t" | |||
| "xvmaddadp 36, 44, 52 \n\t" | |||
| "xvmaddadp 37, 45, 53 \n\t" | |||
| "xvmaddadp 38, 46, 54 \n\t" | |||
| "xvmaddadp 39, 47, 55 \n\t" | |||
| "xvadddp 32, 32, 33 \n\t" | |||
| "xvadddp 34, 34, 35 \n\t" | |||
| "xvadddp 36, 36, 37 \n\t" | |||
| "xvadddp 38, 38, 39 \n\t" | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
| XXSWAPD_S(33,32) | |||
| "xsadddp %x0, 32, 33 \n" | |||
| "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" | |||
| : | |||
| "=d" (dot), // 0 | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x), | |||
| "m" (*y) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" | |||
| ); | |||
| return dot; | |||
| } | |||
| @@ -0,0 +1,130 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "ddot_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot = 0.0; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] | |||
| + y[i+1] * x[i+1] | |||
| + y[i+2] * x[i+2] | |||
| + y[i+3] * x[i+3] | |||
| + y[i+4] * x[i+4] | |||
| + y[i+5] * x[i+5] | |||
| + y[i+6] * x[i+6] | |||
| + y[i+7] * x[i+7] ; | |||
| i+=8 ; | |||
| } | |||
| return dot; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT dot = 0.0 ; | |||
| if ( n <= 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 ) | |||
| dot = ddot_kernel_8(n1, x, y); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| BLASLONG n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| FLOAT m1 = y[iy] * x[ix] ; | |||
| FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||
| FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||
| FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||
| ix += inc_x*4 ; | |||
| iy += inc_y*4 ; | |||
| temp1 += m1+m3; | |||
| temp2 += m2+m4; | |||
| i+=4 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp1 += y[iy] * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| dot = temp1 + temp2; | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,135 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static float sdot_kernel_16 (long n, float *x, float *y) | |||
| { | |||
| float dot; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| "dcbt 0, %3 \n\t" | |||
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "xvmaddasp 36, 44, 52 \n\t" | |||
| "xvmaddasp 37, 45, 53 \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "xvmaddasp 38, 46, 54 \n\t" | |||
| "xvmaddasp 39, 47, 55 \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -32 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "xvmaddasp 36, 44, 52 \n\t" | |||
| "xvmaddasp 37, 45, 53 \n\t" | |||
| "xvmaddasp 38, 46, 54 \n\t" | |||
| "xvmaddasp 39, 47, 55 \n\t" | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xvaddsp 34, 34, 35 \n\t" | |||
| "xvaddsp 36, 36, 37 \n\t" | |||
| "xvaddsp 38, 38, 39 \n\t" | |||
| "xvaddsp 32, 32, 34 \n\t" | |||
| "xvaddsp 36, 36, 38 \n\t" | |||
| "xvaddsp 32, 32, 36 \n\t" | |||
| "xxsldwi 33, 32, 32, 2 \n\t" | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xxsldwi 33, 32, 32, 1 \n\t" | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xscvspdp %x0, 32 \n" | |||
| "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" | |||
| : | |||
| "=f" (dot), // 0 | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| : | |||
| "m" (*x), | |||
| "m" (*y) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" | |||
| ); | |||
| return dot; | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "sdot_microk_power10.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot = 0.0; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] | |||
| + y[i+1] * x[i+1] | |||
| + y[i+2] * x[i+2] | |||
| + y[i+3] * x[i+3] | |||
| + y[i+4] * x[i+4] | |||
| + y[i+5] * x[i+5] | |||
| + y[i+6] * x[i+6] | |||
| + y[i+7] * x[i+7] ; | |||
| i+=8 ; | |||
| } | |||
| return dot; | |||
| } | |||
| #endif | |||
| #if defined (DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| double dot = 0.0 ; | |||
| #if defined (DSDOT) | |||
| double mydot = 0.0; | |||
| FLOAT asmdot = 0.0; | |||
| #else | |||
| FLOAT mydot=0.0; | |||
| #endif | |||
| BLASLONG n1; | |||
| if ( n <= 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| n1 = n & (BLASLONG)(-32); | |||
| if ( n1 ) | |||
| #if defined(DSDOT) | |||
| { | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| BLASLONG n2 = 32; | |||
| while (i<n1) { | |||
| asmdot = sdot_kernel_16(n2, x1, y1); | |||
| mydot += (double)asmdot; | |||
| asmdot=0.; | |||
| x1+=32; | |||
| y1+=32; | |||
| i+=32; | |||
| } | |||
| } | |||
| #else | |||
| mydot = sdot_kernel_16(n1, x, y); | |||
| #endif | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| #if defined(DSDOT) | |||
| dot += (double)y[i] * (double)x[i] ; | |||
| #else | |||
| dot += y[i] * x[i] ; | |||
| #endif | |||
| i++ ; | |||
| } | |||
| dot+=mydot; | |||
| return(dot); | |||
| } | |||
| n1 = n & (BLASLONG)(-2); | |||
| while(i < n1) | |||
| { | |||
| #if defined (DSDOT) | |||
| dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x]; | |||
| #else | |||
| dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; | |||
| #endif | |||
| ix += inc_x*2 ; | |||
| iy += inc_y*2 ; | |||
| i+=2 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| #if defined (DSDOT) | |||
| dot += (double)y[iy] * (double)x[ix] ; | |||
| #else | |||
| dot += y[iy] * x[ix] ; | |||
| #endif | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| @@ -0,0 +1,139 @@ | |||
| #include "common.h" | |||
| #if defined(SKYLAKEX) | |||
| #include "drot_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "drot_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_DROT_KERNEL | |||
| static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT f0, f1, f2, f3; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT g0, g1, g2, g3; | |||
| FLOAT y0, y1, y2, y3; | |||
| FLOAT* xp = x; | |||
| FLOAT* yp = y; | |||
| BLASLONG n1 = n & (~7); | |||
| while (i < n1) { | |||
| x0 = xp[0]; | |||
| y0 = yp[0]; | |||
| x1 = xp[1]; | |||
| y1 = yp[1]; | |||
| x2 = xp[2]; | |||
| y2 = yp[2]; | |||
| x3 = xp[3]; | |||
| y3 = yp[3]; | |||
| f0 = c*x0 + s*y0; | |||
| g0 = c*y0 - s*x0; | |||
| f1 = c*x1 + s*y1; | |||
| g1 = c*y1 - s*x1; | |||
| f2 = c*x2 + s*y2; | |||
| g2 = c*y2 - s*x2; | |||
| f3 = c*x3 + s*y3; | |||
| g3 = c*y3 - s*x3; | |||
| xp[0] = f0; | |||
| yp[0] = g0; | |||
| xp[1] = f1; | |||
| yp[1] = g1; | |||
| xp[2] = f2; | |||
| yp[2] = g2; | |||
| xp[3] = f3; | |||
| yp[3] = g3; | |||
| xp += 4; | |||
| yp += 4; | |||
| i += 4; | |||
| } | |||
| while (i < n) { | |||
| FLOAT temp = c*x[i] + s*y[i]; | |||
| y[i] = c*y[i] - s*x[i]; | |||
| x[i] = temp; | |||
| i++; | |||
| } | |||
| } | |||
| #endif | |||
| static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT temp; | |||
| if (n <= 0) | |||
| return; | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| drot_kernel(n, x, y, c, s); | |||
| } | |||
| else { | |||
| while (i < n) { | |||
| temp = c * x[ix] + s * y[iy]; | |||
| y[iy] = c * y[iy] - s * x[ix]; | |||
| x[ix] = temp; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| #if defined(SMP) | |||
| static int rot_thread_function(blas_arg_t *args) | |||
| { | |||
| rot_compute(args->m, | |||
| args->a, args->lda, | |||
| args->b, args->ldb, | |||
| ((FLOAT *)args->alpha)[0], | |||
| ((FLOAT *)args->alpha)[1]); | |||
| return 0; | |||
| } | |||
| extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT alpha[2]={c, s}; | |||
| FLOAT dummy_c; | |||
| #endif | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 100000) { | |||
| nthreads = 1; | |||
| } | |||
| else { | |||
| nthreads = num_cpu_avail(1); | |||
| } | |||
| if (nthreads == 1) { | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| } | |||
| else { | |||
| #if defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | |||
| } | |||
| #else | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_DROT_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG tail_index_4 = n&(~3); | |||
| BLASLONG tail_index_16 = n&(~15); | |||
| __m256d c_256, s_256; | |||
| if (n >= 4) { | |||
| c_256 = _mm256_set1_pd(c); | |||
| s_256 = _mm256_set1_pd(s); | |||
| } | |||
| __m256d x0, x1, x2, x3; | |||
| __m256d y0, y1, y2, y3; | |||
| __m256d t0, t1, t2, t3; | |||
| for (i = 0; i < tail_index_16; i += 16) { | |||
| x0 = _mm256_loadu_pd(&x[i + 0]); | |||
| x1 = _mm256_loadu_pd(&x[i + 4]); | |||
| x2 = _mm256_loadu_pd(&x[i + 8]); | |||
| x3 = _mm256_loadu_pd(&x[i +12]); | |||
| y0 = _mm256_loadu_pd(&y[i + 0]); | |||
| y1 = _mm256_loadu_pd(&y[i + 4]); | |||
| y2 = _mm256_loadu_pd(&y[i + 8]); | |||
| y3 = _mm256_loadu_pd(&y[i +12]); | |||
| t0 = _mm256_mul_pd(s_256, y0); | |||
| t1 = _mm256_mul_pd(s_256, y1); | |||
| t2 = _mm256_mul_pd(s_256, y2); | |||
| t3 = _mm256_mul_pd(s_256, y3); | |||
| t0 = _mm256_fmadd_pd(c_256, x0, t0); | |||
| t1 = _mm256_fmadd_pd(c_256, x1, t1); | |||
| t2 = _mm256_fmadd_pd(c_256, x2, t2); | |||
| t3 = _mm256_fmadd_pd(c_256, x3, t3); | |||
| _mm256_storeu_pd(&x[i + 0], t0); | |||
| _mm256_storeu_pd(&x[i + 4], t1); | |||
| _mm256_storeu_pd(&x[i + 8], t2); | |||
| _mm256_storeu_pd(&x[i +12], t3); | |||
| t0 = _mm256_mul_pd(s_256, x0); | |||
| t1 = _mm256_mul_pd(s_256, x1); | |||
| t2 = _mm256_mul_pd(s_256, x2); | |||
| t3 = _mm256_mul_pd(s_256, x3); | |||
| t0 = _mm256_fmsub_pd(c_256, y0, t0); | |||
| t1 = _mm256_fmsub_pd(c_256, y1, t1); | |||
| t2 = _mm256_fmsub_pd(c_256, y2, t2); | |||
| t3 = _mm256_fmsub_pd(c_256, y3, t3); | |||
| _mm256_storeu_pd(&y[i + 0], t0); | |||
| _mm256_storeu_pd(&y[i + 4], t1); | |||
| _mm256_storeu_pd(&y[i + 8], t2); | |||
| _mm256_storeu_pd(&y[i +12], t3); | |||
| } | |||
| for (i = tail_index_16; i < tail_index_4; i += 4) { | |||
| x0 = _mm256_loadu_pd(&x[i]); | |||
| y0 = _mm256_loadu_pd(&y[i]); | |||
| t0 = _mm256_mul_pd(s_256, y0); | |||
| t0 = _mm256_fmadd_pd(c_256, x0, t0); | |||
| _mm256_storeu_pd(&x[i], t0); | |||
| t0 = _mm256_mul_pd(s_256, x0); | |||
| t0 = _mm256_fmsub_pd(c_256, y0, t0); | |||
| _mm256_storeu_pd(&y[i], t0); | |||
| } | |||
| for (i = tail_index_4; i < n; ++i) { | |||
| FLOAT temp = c * x[i] + s * y[i]; | |||
| y[i] = c * y[i] - s * x[i]; | |||
| x[i] = temp; | |||
| } | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,94 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_DROT_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG n1 = n; | |||
| BLASLONG tail_index_8 = 0; | |||
| BLASLONG tail_index_32 = 0; | |||
| __m512d c_512 = _mm512_set1_pd(c); | |||
| __m512d s_512 = _mm512_set1_pd(s); | |||
| tail_index_8 = n1 & (~7); | |||
| tail_index_32 = n1 & (~31); | |||
| __m512d x0, x1, x2, x3; | |||
| __m512d y0, y1, y2, y3; | |||
| __m512d t0, t1, t2, t3; | |||
| for (i = 0; i < tail_index_32; i += 32) { | |||
| x0 = _mm512_loadu_pd(&x[i + 0]); | |||
| x1 = _mm512_loadu_pd(&x[i + 8]); | |||
| x2 = _mm512_loadu_pd(&x[i +16]); | |||
| x3 = _mm512_loadu_pd(&x[i +24]); | |||
| y0 = _mm512_loadu_pd(&y[i + 0]); | |||
| y1 = _mm512_loadu_pd(&y[i + 8]); | |||
| y2 = _mm512_loadu_pd(&y[i +16]); | |||
| y3 = _mm512_loadu_pd(&y[i +24]); | |||
| t0 = _mm512_mul_pd(s_512, y0); | |||
| t1 = _mm512_mul_pd(s_512, y1); | |||
| t2 = _mm512_mul_pd(s_512, y2); | |||
| t3 = _mm512_mul_pd(s_512, y3); | |||
| t0 = _mm512_fmadd_pd(c_512, x0, t0); | |||
| t1 = _mm512_fmadd_pd(c_512, x1, t1); | |||
| t2 = _mm512_fmadd_pd(c_512, x2, t2); | |||
| t3 = _mm512_fmadd_pd(c_512, x3, t3); | |||
| _mm512_storeu_pd(&x[i + 0], t0); | |||
| _mm512_storeu_pd(&x[i + 8], t1); | |||
| _mm512_storeu_pd(&x[i +16], t2); | |||
| _mm512_storeu_pd(&x[i +24], t3); | |||
| t0 = _mm512_mul_pd(s_512, x0); | |||
| t1 = _mm512_mul_pd(s_512, x1); | |||
| t2 = _mm512_mul_pd(s_512, x2); | |||
| t3 = _mm512_mul_pd(s_512, x3); | |||
| t0 = _mm512_fmsub_pd(c_512, y0, t0); | |||
| t1 = _mm512_fmsub_pd(c_512, y1, t1); | |||
| t2 = _mm512_fmsub_pd(c_512, y2, t2); | |||
| t3 = _mm512_fmsub_pd(c_512, y3, t3); | |||
| _mm512_storeu_pd(&y[i + 0], t0); | |||
| _mm512_storeu_pd(&y[i + 8], t1); | |||
| _mm512_storeu_pd(&y[i +16], t2); | |||
| _mm512_storeu_pd(&y[i +24], t3); | |||
| } | |||
| for (i = tail_index_32; i < tail_index_8; i += 8) { | |||
| x0 = _mm512_loadu_pd(&x[i]); | |||
| y0 = _mm512_loadu_pd(&y[i]); | |||
| t0 = _mm512_mul_pd(s_512, y0); | |||
| t0 = _mm512_fmadd_pd(c_512, x0, t0); | |||
| _mm512_storeu_pd(&x[i], t0); | |||
| t0 = _mm512_mul_pd(s_512, x0); | |||
| t0 = _mm512_fmsub_pd(c_512, y0, t0); | |||
| _mm512_storeu_pd(&y[i], t0); | |||
| } | |||
| if ((n1&7) > 0) { | |||
| unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); | |||
| __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); | |||
| __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); | |||
| __m512d temp = _mm512_mul_pd(s_512, tail_y); | |||
| temp = _mm512_fmadd_pd(c_512, tail_x, temp); | |||
| _mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); | |||
| temp = _mm512_mul_pd(s_512, tail_x); | |||
| temp = _mm512_fmsub_pd(c_512, tail_y, temp); | |||
| _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); | |||
| } | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,139 @@ | |||
| #include "common.h" | |||
| #if defined(SKYLAKEX) | |||
| #include "srot_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "srot_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_SROT_KERNEL | |||
| static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT f0, f1, f2, f3; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT g0, g1, g2, g3; | |||
| FLOAT y0, y1, y2, y3; | |||
| FLOAT* xp = x; | |||
| FLOAT* yp = y; | |||
| BLASLONG n1 = n & (~7); | |||
| while (i < n1) { | |||
| x0 = xp[0]; | |||
| y0 = yp[0]; | |||
| x1 = xp[1]; | |||
| y1 = yp[1]; | |||
| x2 = xp[2]; | |||
| y2 = yp[2]; | |||
| x3 = xp[3]; | |||
| y3 = yp[3]; | |||
| f0 = c*x0 + s*y0; | |||
| g0 = c*y0 - s*x0; | |||
| f1 = c*x1 + s*y1; | |||
| g1 = c*y1 - s*x1; | |||
| f2 = c*x2 + s*y2; | |||
| g2 = c*y2 - s*x2; | |||
| f3 = c*x3 + s*y3; | |||
| g3 = c*y3 - s*x3; | |||
| xp[0] = f0; | |||
| yp[0] = g0; | |||
| xp[1] = f1; | |||
| yp[1] = g1; | |||
| xp[2] = f2; | |||
| yp[2] = g2; | |||
| xp[3] = f3; | |||
| yp[3] = g3; | |||
| xp += 4; | |||
| yp += 4; | |||
| i += 4; | |||
| } | |||
| while (i < n) { | |||
| FLOAT temp = c*x[i] + s*y[i]; | |||
| y[i] = c*y[i] - s*x[i]; | |||
| x[i] = temp; | |||
| i++; | |||
| } | |||
| } | |||
| #endif | |||
| static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT temp; | |||
| if (n <= 0) | |||
| return; | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| srot_kernel(n, x, y, c, s); | |||
| } | |||
| else { | |||
| while (i < n) { | |||
| temp = c * x[ix] + s * y[iy]; | |||
| y[iy] = c * y[iy] - s * x[ix]; | |||
| x[ix] = temp; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| #if defined(SMP) | |||
| static int rot_thread_function(blas_arg_t *args) | |||
| { | |||
| rot_compute(args->m, | |||
| args->a, args->lda, | |||
| args->b, args->ldb, | |||
| ((float *)args->alpha)[0], | |||
| ((float *)args->alpha)[1]); | |||
| return 0; | |||
| } | |||
| extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT alpha[2]={c, s}; | |||
| FLOAT dummy_c; | |||
| #endif | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 100000) { | |||
| nthreads = 1; | |||
| } | |||
| else { | |||
| nthreads = num_cpu_avail(1); | |||
| } | |||
| if (nthreads == 1) { | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| } | |||
| else { | |||
| #if defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | |||
| } | |||
| #else | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_SROT_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG tail_index_8 = n&(~7); | |||
| BLASLONG tail_index_32 = n&(~31); | |||
| __m256 c_256, s_256; | |||
| if (n >= 8) { | |||
| c_256 = _mm256_set1_ps(c); | |||
| s_256 = _mm256_set1_ps(s); | |||
| } | |||
| __m256 x0, x1, x2, x3; | |||
| __m256 y0, y1, y2, y3; | |||
| __m256 t0, t1, t2, t3; | |||
| for (i = 0; i < tail_index_32; i += 32) { | |||
| x0 = _mm256_loadu_ps(&x[i + 0]); | |||
| x1 = _mm256_loadu_ps(&x[i + 8]); | |||
| x2 = _mm256_loadu_ps(&x[i +16]); | |||
| x3 = _mm256_loadu_ps(&x[i +24]); | |||
| y0 = _mm256_loadu_ps(&y[i + 0]); | |||
| y1 = _mm256_loadu_ps(&y[i + 8]); | |||
| y2 = _mm256_loadu_ps(&y[i +16]); | |||
| y3 = _mm256_loadu_ps(&y[i +24]); | |||
| t0 = _mm256_mul_ps(s_256, y0); | |||
| t1 = _mm256_mul_ps(s_256, y1); | |||
| t2 = _mm256_mul_ps(s_256, y2); | |||
| t3 = _mm256_mul_ps(s_256, y3); | |||
| t0 = _mm256_fmadd_ps(c_256, x0, t0); | |||
| t1 = _mm256_fmadd_ps(c_256, x1, t1); | |||
| t2 = _mm256_fmadd_ps(c_256, x2, t2); | |||
| t3 = _mm256_fmadd_ps(c_256, x3, t3); | |||
| _mm256_storeu_ps(&x[i + 0], t0); | |||
| _mm256_storeu_ps(&x[i + 8], t1); | |||
| _mm256_storeu_ps(&x[i +16], t2); | |||
| _mm256_storeu_ps(&x[i +24], t3); | |||
| t0 = _mm256_mul_ps(s_256, x0); | |||
| t1 = _mm256_mul_ps(s_256, x1); | |||
| t2 = _mm256_mul_ps(s_256, x2); | |||
| t3 = _mm256_mul_ps(s_256, x3); | |||
| t0 = _mm256_fmsub_ps(c_256, y0, t0); | |||
| t1 = _mm256_fmsub_ps(c_256, y1, t1); | |||
| t2 = _mm256_fmsub_ps(c_256, y2, t2); | |||
| t3 = _mm256_fmsub_ps(c_256, y3, t3); | |||
| _mm256_storeu_ps(&y[i + 0], t0); | |||
| _mm256_storeu_ps(&y[i + 8], t1); | |||
| _mm256_storeu_ps(&y[i +16], t2); | |||
| _mm256_storeu_ps(&y[i +24], t3); | |||
| } | |||
| for (i = tail_index_32; i < tail_index_8; i += 8) { | |||
| x0 = _mm256_loadu_ps(&x[i]); | |||
| y0 = _mm256_loadu_ps(&y[i]); | |||
| t0 = _mm256_mul_ps(s_256, y0); | |||
| t0 = _mm256_fmadd_ps(c_256, x0, t0); | |||
| _mm256_storeu_ps(&x[i], t0); | |||
| t0 = _mm256_mul_ps(s_256, x0); | |||
| t0 = _mm256_fmsub_ps(c_256, y0, t0); | |||
| _mm256_storeu_ps(&y[i], t0); | |||
| } | |||
| for (i = tail_index_8; i < n; ++i) { | |||
| FLOAT temp = c * x[i] + s * y[i]; | |||
| y[i] = c * y[i] - s * x[i]; | |||
| x[i] = temp; | |||
| } | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,91 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_SROT_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m512 c_512, s_512; | |||
| c_512 = _mm512_set1_ps(c); | |||
| s_512 = _mm512_set1_ps(s); | |||
| BLASLONG tail_index_16 = n&(~15); | |||
| BLASLONG tail_index_64 = n&(~63); | |||
| __m512 x0, x1, x2, x3; | |||
| __m512 y0, y1, y2, y3; | |||
| __m512 t0, t1, t2, t3; | |||
| for (i = 0; i < tail_index_64; i += 64) { | |||
| x0 = _mm512_loadu_ps(&x[i + 0]); | |||
| x1 = _mm512_loadu_ps(&x[i +16]); | |||
| x2 = _mm512_loadu_ps(&x[i +32]); | |||
| x3 = _mm512_loadu_ps(&x[i +48]); | |||
| y0 = _mm512_loadu_ps(&y[i + 0]); | |||
| y1 = _mm512_loadu_ps(&y[i +16]); | |||
| y2 = _mm512_loadu_ps(&y[i +32]); | |||
| y3 = _mm512_loadu_ps(&y[i +48]); | |||
| t0 = _mm512_mul_ps(s_512, y0); | |||
| t1 = _mm512_mul_ps(s_512, y1); | |||
| t2 = _mm512_mul_ps(s_512, y2); | |||
| t3 = _mm512_mul_ps(s_512, y3); | |||
| t0 = _mm512_fmadd_ps(c_512, x0, t0); | |||
| t1 = _mm512_fmadd_ps(c_512, x1, t1); | |||
| t2 = _mm512_fmadd_ps(c_512, x2, t2); | |||
| t3 = _mm512_fmadd_ps(c_512, x3, t3); | |||
| _mm512_storeu_ps(&x[i + 0], t0); | |||
| _mm512_storeu_ps(&x[i +16], t1); | |||
| _mm512_storeu_ps(&x[i +32], t2); | |||
| _mm512_storeu_ps(&x[i +48], t3); | |||
| t0 = _mm512_mul_ps(s_512, x0); | |||
| t1 = _mm512_mul_ps(s_512, x1); | |||
| t2 = _mm512_mul_ps(s_512, x2); | |||
| t3 = _mm512_mul_ps(s_512, x3); | |||
| t0 = _mm512_fmsub_ps(c_512, y0, t0); | |||
| t1 = _mm512_fmsub_ps(c_512, y1, t1); | |||
| t2 = _mm512_fmsub_ps(c_512, y2, t2); | |||
| t3 = _mm512_fmsub_ps(c_512, y3, t3); | |||
| _mm512_storeu_ps(&y[i + 0], t0); | |||
| _mm512_storeu_ps(&y[i +16], t1); | |||
| _mm512_storeu_ps(&y[i +32], t2); | |||
| _mm512_storeu_ps(&y[i +48], t3); | |||
| } | |||
| for (i = tail_index_64; i < tail_index_16; i += 16) { | |||
| x0 = _mm512_loadu_ps(&x[i]); | |||
| y0 = _mm512_loadu_ps(&y[i]); | |||
| t0 = _mm512_mul_ps(s_512, y0); | |||
| t0 = _mm512_fmadd_ps(c_512, x0, t0); | |||
| _mm512_storeu_ps(&x[i], t0); | |||
| t0 = _mm512_mul_ps(s_512, x0); | |||
| t0 = _mm512_fmsub_ps(c_512, y0, t0); | |||
| _mm512_storeu_ps(&y[i], t0); | |||
| } | |||
| if ((n & 15) > 0) { | |||
| uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15))); | |||
| __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]); | |||
| __m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]); | |||
| __m512 temp = _mm512_mul_ps(s_512, tail_y); | |||
| temp = _mm512_fmadd_ps(c_512, tail_x, temp); | |||
| _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp); | |||
| temp = _mm512_mul_ps(s_512, tail_x); | |||
| temp = _mm512_fmsub_ps(c_512, tail_y, temp); | |||
| _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp); | |||
| } | |||
| } | |||
| #endif | |||