Optimize the performance of daxpy by using universal intrinsics
| @@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | ||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | ||||
| "-DHAVE_AVX -DHAVE_FMA4" | |||||
| "-DHAVE_AVX" | |||||
| #define LIBNAME "bulldozer" | #define LIBNAME "bulldozer" | ||||
| #define CORENAME "BULLDOZER" | #define CORENAME "BULLDOZER" | ||||
| #endif | #endif | ||||
| @@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | ||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | ||||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||||
| "-DHAVE_AVX -DHAVE_FMA3" | |||||
| #define LIBNAME "piledriver" | #define LIBNAME "piledriver" | ||||
| #define CORENAME "PILEDRIVER" | #define CORENAME "PILEDRIVER" | ||||
| #endif | #endif | ||||
| @@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | ||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | ||||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||||
| "-DHAVE_AVX -DHAVE_FMA3" | |||||
| #define LIBNAME "steamroller" | #define LIBNAME "steamroller" | ||||
| #define CORENAME "STEAMROLLER" | #define CORENAME "STEAMROLLER" | ||||
| #endif | #endif | ||||
| @@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | ||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | ||||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||||
| "-DHAVE_AVX -DHAVE_FMA3" | |||||
| #define LIBNAME "excavator" | #define LIBNAME "excavator" | ||||
| #define CORENAME "EXCAVATOR" | #define CORENAME "EXCAVATOR" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,71 @@ | |||||
#ifndef _INTRIN_H_
#define _INTRIN_H_

/*
 * Universal SIMD intrinsics dispatch header.
 *
 * Pulls in the x86 intrinsic headers selected by the build's HAVE_* feature
 * flags, then includes the widest available v_* abstraction layer
 * (intrin_avx512.h / intrin_avx.h / intrin_sse.h).  When no vector layer is
 * selected, V_SIMD is defined to 0 so kernels can fall back to scalar code.
 */

/* BLAS_INLINE: portable spelling of the `inline` qualifier. */
#if defined(_MSC_VER)
    #define BLAS_INLINE __inline
#elif defined(__GNUC__)
    #if defined(__STRICT_ANSI__)
        #define BLAS_INLINE __inline__
    #else
        #define BLAS_INLINE inline
    #endif
#else
    #define BLAS_INLINE
#endif

/* BLAS_FINLINE: force-inlined static helper (best effort per compiler). */
#ifdef _MSC_VER
    #define BLAS_FINLINE static __forceinline
#elif defined(__GNUC__)
    #define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
#else
    #define BLAS_FINLINE static
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* Intrinsic headers, gated on the build's CPU feature macros. */
/** SSE **/
#ifdef HAVE_SSE
    #include <xmmintrin.h>
#endif
/** SSE2 **/
#ifdef HAVE_SSE2
    #include <emmintrin.h>
#endif
/** SSE3 **/
#ifdef HAVE_SSE3
    #include <pmmintrin.h>
#endif
/** SSSE3 **/
#ifdef HAVE_SSSE3
    #include <tmmintrin.h>
#endif
/** SSE41 **/
#ifdef HAVE_SSE4_1
    #include <smmintrin.h>
#endif
/** AVX **/
#ifdef HAVE_AVX
    #include <immintrin.h>
#endif

/* Dispatch to the widest abstraction layer the build supports. */
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
    #include "intrin_avx512.h"
#elif defined(HAVE_AVX2)
    #include "intrin_avx.h"
#elif defined(HAVE_SSE2)
    #include "intrin_sse.h"
#endif

/* Scalar fallback: no vector layer was selected above. */
#ifndef V_SIMD
    #define V_SIMD 0
    #define V_SIMD_F64 0
#endif

#ifdef __cplusplus
}
#endif
#endif // _INTRIN_H_
| @@ -0,0 +1,29 @@ | |||||
| #define V_SIMD 256 | |||||
| #define V_SIMD_F64 1 | |||||
| /* | |||||
| Data Type | |||||
| */ | |||||
| typedef __m256 v_f32; | |||||
| #define v_nlanes_f32 8 | |||||
| /* | |||||
| arithmetic | |||||
| */ | |||||
| #define v_add_f32 _mm256_add_ps | |||||
| #define v_mul_f32 _mm256_mul_ps | |||||
| #ifdef HAVE_FMA3 | |||||
| // multiply and add, a*b + c | |||||
| #define v_muladd_f32 _mm256_fmadd_ps | |||||
| #else | |||||
| // multiply and add, a*b + c | |||||
| BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) | |||||
| { return v_add_f32(v_mul_f32(a, b), c); } | |||||
| #endif // !HAVE_FMA3 | |||||
| /* | |||||
| memory | |||||
| */ | |||||
| // unaligned load | |||||
| #define v_loadu_f32 _mm256_loadu_ps | |||||
| #define v_storeu_f32 _mm256_storeu_ps | |||||
| #define v_setall_f32(VAL) _mm256_set1_ps(VAL) | |||||
| @@ -0,0 +1,21 @@ | |||||
/*
 * AVX512 layer of the universal intrinsics: 512-bit float32 vectors.
 * NOTE(review): V_SIMD_F64 advertises double support but no v_*_f64
 * operations are defined in this header yet — confirm before relying on it.
 */
#define V_SIMD 512
#define V_SIMD_F64 1
/*
 Data Type
*/
typedef __m512 v_f32;
#define v_nlanes_f32 16  // float32 lanes per vector
/*
 arithmetic
*/
#define v_add_f32 _mm512_add_ps
#define v_mul_f32 _mm512_mul_ps
// fused multiply-add, a*b + c (AVX512F always provides FMA)
#define v_muladd_f32 _mm512_fmadd_ps
/*
 memory
*/
// unaligned load/store; _mm512_loadu_ps takes the address directly, so the
// previous (const __m512*) cast was misleading and inconsistent with the
// SSE/AVX variants — dropped.
#define v_loadu_f32 _mm512_loadu_ps
#define v_storeu_f32 _mm512_storeu_ps
// broadcast scalar to all lanes
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
| @@ -0,0 +1,30 @@ | |||||
| #define V_SIMD 128 | |||||
| #define V_SIMD_F64 1 | |||||
| /* | |||||
| Data Type | |||||
| */ | |||||
| typedef __m128 v_f32; | |||||
| #define v_nlanes_f32 4 | |||||
| /* | |||||
| arithmetic | |||||
| */ | |||||
| #define v_add_f32 _mm_add_ps | |||||
| #define v_mul_f32 _mm_mul_ps | |||||
| #ifdef HAVE_FMA3 | |||||
| // multiply and add, a*b + c | |||||
| #define v_muladd_f32 _mm_fmadd_ps | |||||
| #elif defined(HAVE_FMA4) | |||||
| // multiply and add, a*b + c | |||||
| #define v_muladd_f32 _mm_macc_ps | |||||
| #else | |||||
| // multiply and add, a*b + c | |||||
| BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) | |||||
| { return v_add_f32(v_mul_f32(a, b), c); } | |||||
| #endif // HAVE_FMA3 | |||||
| /* | |||||
| memory | |||||
| */ | |||||
| // unaligned load | |||||
| #define v_loadu_f32 _mm_loadu_ps | |||||
| #define v_storeu_f32 _mm_storeu_ps | |||||
| #define v_setall_f32(VAL) _mm_set1_ps(VAL) | |||||
| @@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "daxpy_microk_sandy-2.c" | #include "daxpy_microk_sandy-2.c" | ||||
| #endif | #endif | ||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| #include"../simd/intrin.h" | |||||
| static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | ||||
| { | { | ||||
| BLASLONG register i = 0; | BLASLONG register i = 0; | ||||
| FLOAT a = *alpha; | FLOAT a = *alpha; | ||||
| #if V_SIMD | |||||
| v_f32 __alpha, tmp; | |||||
| __alpha = v_setall_f32(*alpha); | |||||
| const int vstep = v_nlanes_f32; | |||||
| for (; i < n; i += vstep) { | |||||
| tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); | |||||
| v_storeu_f32(y + i, tmp); | |||||
| } | |||||
| #else | |||||
| while(i < n) | while(i < n) | ||||
| { | |||||
| y[i] += a * x[i]; | |||||
| y[i+1] += a * x[i+1]; | |||||
| y[i+2] += a * x[i+2]; | |||||
| y[i+3] += a * x[i+3]; | |||||
| y[i+4] += a * x[i+4]; | |||||
| y[i+5] += a * x[i+5]; | |||||
| y[i+6] += a * x[i+6]; | |||||
| y[i+7] += a * x[i+7]; | |||||
| i+=8 ; | |||||
| } | |||||
| { | |||||
| y[i] += a * x[i]; | |||||
| y[i+1] += a * x[i+1]; | |||||
| y[i+2] += a * x[i+2]; | |||||
| y[i+3] += a * x[i+3]; | |||||
| y[i+4] += a * x[i+4]; | |||||
| y[i+5] += a * x[i+5]; | |||||
| y[i+6] += a * x[i+6]; | |||||
| y[i+7] += a * x[i+7]; | |||||
| i+=8 ; | |||||
| } | |||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||