Optimize the performance of daxpy by using universal intrinsics (tags/v0.3.11^2)
| @@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4" | |||
| "-DHAVE_AVX" | |||
| #define LIBNAME "bulldozer" | |||
| #define CORENAME "BULLDOZER" | |||
| #endif | |||
| @@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||
| "-DHAVE_AVX -DHAVE_FMA3" | |||
| #define LIBNAME "piledriver" | |||
| #define CORENAME "PILEDRIVER" | |||
| #endif | |||
| @@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||
| "-DHAVE_AVX -DHAVE_FMA3" | |||
| #define LIBNAME "steamroller" | |||
| #define CORENAME "STEAMROLLER" | |||
| #endif | |||
| @@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||
| "-DHAVE_AVX -DHAVE_FMA3" | |||
| #define LIBNAME "excavator" | |||
| #define CORENAME "EXCAVATOR" | |||
| #endif | |||
| @@ -0,0 +1,71 @@ | |||
/*
 * Universal SIMD intrinsics layer: maps the generic v_* names onto the
 * widest x86 instruction set enabled by the build (SSE2 / AVX2 / AVX512).
 */
#ifndef INTRIN_H_ /* was _INTRIN_H_; leading underscore + uppercase is reserved (C11 7.1.3) */
#define INTRIN_H_

/* BLAS_INLINE: plain inline hint, spelled per compiler/dialect. */
#if defined(_MSC_VER)
#define BLAS_INLINE __inline
#elif defined(__GNUC__)
#if defined(__STRICT_ANSI__)
#define BLAS_INLINE __inline__
#else
#define BLAS_INLINE inline
#endif
#else
#define BLAS_INLINE
#endif

/* BLAS_FINLINE: static + force-inline, for small hot helpers defined in headers. */
#ifdef _MSC_VER
#define BLAS_FINLINE static __forceinline
#elif defined(__GNUC__)
#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
#else
#define BLAS_FINLINE static
#endif

/* include head: one intrinsics header per ISA level the build enables.
 * Kept OUTSIDE extern "C" — intrinsic headers may declare C++ overloads,
 * so wrapping them in a C-linkage block could break C++ builds. */
/** SSE **/
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
/** SSE2 **/
#ifdef HAVE_SSE2
#include <emmintrin.h>
#endif
/** SSE3 **/
#ifdef HAVE_SSE3
#include <pmmintrin.h>
#endif
/** SSSE3 **/
#ifdef HAVE_SSSE3
#include <tmmintrin.h>
#endif
/** SSE41 **/
#ifdef HAVE_SSE4_1
#include <smmintrin.h>
#endif
/** AVX **/
#ifdef HAVE_AVX
#include <immintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* distribute: select the widest available backend implementation */
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
#include "intrin_avx512.h"
#elif defined(HAVE_AVX2)
#include "intrin_avx.h"
#elif defined(HAVE_SSE2)
#include "intrin_sse.h"
#endif

/* scalar fallback: no universal SIMD backend was selected */
#ifndef V_SIMD
#define V_SIMD 0
#define V_SIMD_F64 0
#endif

#ifdef __cplusplus
}
#endif
#endif // INTRIN_H_
| @@ -0,0 +1,29 @@ | |||
/* 256-bit (AVX/AVX2) backend for the universal SIMD intrinsics.
 * Included only via intrin.h, which supplies BLAS_FINLINE and
 * <immintrin.h> — not meant to be included directly. */
#define V_SIMD 256
#define V_SIMD_F64 1
/*
Data Type
*/
// one AVX vector = 8 packed single-precision lanes
typedef __m256 v_f32;
#define v_nlanes_f32 8
/*
arithmetic
*/
#define v_add_f32 _mm256_add_ps
#define v_mul_f32 _mm256_mul_ps
#ifdef HAVE_FMA3
// multiply and add, a*b + c (fused: single instruction, one rounding)
#define v_muladd_f32 _mm256_fmadd_ps
#else
// multiply and add, a*b + c (separate mul + add; rounds twice, so the
// last bit may differ from the FMA3 path)
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
#endif // !HAVE_FMA3
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm256_loadu_ps
#define v_storeu_f32 _mm256_storeu_ps
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
| @@ -0,0 +1,21 @@ | |||
/* 512-bit (AVX512) backend for the universal SIMD intrinsics.
 * Included only via intrin.h, which supplies <immintrin.h> — not meant
 * to be included directly. */
#define V_SIMD 512
#define V_SIMD_F64 1
/*
Data Type
*/
// one AVX512 vector = 16 packed single-precision lanes
typedef __m512 v_f32;
#define v_nlanes_f32 16
/*
arithmetic
*/
#define v_add_f32 _mm512_add_ps
#define v_mul_f32 _mm512_mul_ps
// multiply and add, a*b + c (AVX512F always provides fused multiply-add)
#define v_muladd_f32 _mm512_fmadd_ps
/*
memory
*/
// unaligned load; pass the pointer straight through — _mm512_loadu_ps
// takes void const*, so the old (const __m512*) cast was wrong-typed,
// unnecessary, and inconsistent with the SSE/AVX backends
#define v_loadu_f32 _mm512_loadu_ps
#define v_storeu_f32 _mm512_storeu_ps
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
| @@ -0,0 +1,30 @@ | |||
/* 128-bit (SSE) backend for the universal SIMD intrinsics.
 * Included only via intrin.h, which supplies BLAS_FINLINE and the
 * <xmmintrin.h>/<emmintrin.h> headers — not meant to be included directly. */
#define V_SIMD 128
#define V_SIMD_F64 1
/*
Data Type
*/
// one SSE vector = 4 packed single-precision lanes
typedef __m128 v_f32;
#define v_nlanes_f32 4
/*
arithmetic
*/
#define v_add_f32 _mm_add_ps
#define v_mul_f32 _mm_mul_ps
#ifdef HAVE_FMA3
// multiply and add, a*b + c (fused: single instruction, one rounding)
#define v_muladd_f32 _mm_fmadd_ps
#elif defined(HAVE_FMA4)
// multiply and add, a*b + c (AMD FMA4 spelling of the fused op)
#define v_muladd_f32 _mm_macc_ps
#else
// multiply and add, a*b + c (separate mul + add; rounds twice, so the
// last bit may differ from the fused paths)
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
#endif // HAVE_FMA3
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm_loadu_ps
#define v_storeu_f32 _mm_storeu_ps
#define v_setall_f32(VAL) _mm_set1_ps(VAL)
| @@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "daxpy_microk_sandy-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| #include"../simd/intrin.h" | |||
| static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT a = *alpha; | |||
| #if V_SIMD | |||
| v_f32 __alpha, tmp; | |||
| __alpha = v_setall_f32(*alpha); | |||
| const int vstep = v_nlanes_f32; | |||
| for (; i < n; i += vstep) { | |||
| tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); | |||
| v_storeu_f32(y + i, tmp); | |||
| } | |||
| #else | |||
| while(i < n) | |||
| { | |||
| y[i] += a * x[i]; | |||
| y[i+1] += a * x[i+1]; | |||
| y[i+2] += a * x[i+2]; | |||
| y[i+3] += a * x[i+3]; | |||
| y[i+4] += a * x[i+4]; | |||
| y[i+5] += a * x[i+5]; | |||
| y[i+6] += a * x[i+6]; | |||
| y[i+7] += a * x[i+7]; | |||
| i+=8 ; | |||
| } | |||
| { | |||
| y[i] += a * x[i]; | |||
| y[i+1] += a * x[i+1]; | |||
| y[i+2] += a * x[i+2]; | |||
| y[i+3] += a * x[i+3]; | |||
| y[i+4] += a * x[i+4]; | |||
| y[i+5] += a * x[i+5]; | |||
| y[i+6] += a * x[i+6]; | |||
| y[i+7] += a * x[i+7]; | |||
| i+=8 ; | |||
| } | |||
| #endif | |||
| } | |||
| #endif | |||