Browse Source

performance improved

tags/v0.3.11^2
Qiyu8 5 years ago
parent
commit
14f7dad3b7
5 changed files with 46 additions and 3 deletions
  1. +20
    -0
      kernel/simd/intrin.h
  2. +10
    -0
      kernel/simd/intrin_avx.h
  3. +3
    -1
      kernel/simd/intrin_avx512.h
  4. +11
    -0
      kernel/simd/intrin_sse.h
  5. +2
    -2
      kernel/x86_64/daxpy.c

+ 20
- 0
kernel/simd/intrin.h View File

@@ -1,6 +1,26 @@
#ifndef _INTRIN_H_
#define _INTRIN_H_

/*
 * BLAS_INLINE: the "inline" keyword in the spelling each compiler accepts.
 *   - MSVC (_MSC_VER)                      -> __inline
 *   - GNU-compatible with __STRICT_ANSI__  -> __inline__
 *   - GNU-compatible otherwise             -> inline
 *   - anything else                        -> expands to nothing
 */
#if defined(_MSC_VER)
#  define BLAS_INLINE __inline
#elif defined(__GNUC__) && defined(__STRICT_ANSI__)
#  define BLAS_INLINE __inline__
#elif defined(__GNUC__)
#  define BLAS_INLINE inline
#else
#  define BLAS_INLINE
#endif

/*
 * BLAS_FINLINE: declaration prefix for small helpers defined in headers.
 * Every variant is `static`; MSVC adds __forceinline, GNU-compatible
 * compilers add BLAS_INLINE plus the always_inline attribute, and any other
 * compiler gets plain `static`.
 * BLAS_INLINE is expanded where BLAS_FINLINE is used, so it must be defined
 * by then.
 */
#if defined(_MSC_VER)
#  define BLAS_FINLINE static __forceinline
#elif !defined(__GNUC__)
#  define BLAS_FINLINE static
#else
#  define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
#endif

#ifdef __cplusplus
extern "C" {
#endif


+ 10
- 0
kernel/simd/intrin_avx.h View File

@@ -10,6 +10,16 @@ arithmetic
*/
// f32 vector add and multiply, aliased directly to the AVX intrinsics
#define v_add_f32 _mm256_add_ps
#define v_mul_f32 _mm256_mul_ps

// v_muladd_f32(a, b, c): multiply and add, a*b + c.
// With HAVE_FMA3 it is an alias of _mm256_fmadd_ps; without it, an inline
// helper builds the same expression from v_mul_f32 followed by v_add_f32.
#if defined(HAVE_FMA3)
#define v_muladd_f32 _mm256_fmadd_ps
#else
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{
    v_f32 product = v_mul_f32(a, b);
    return v_add_f32(product, c);
}
#endif // !HAVE_FMA3

/*
memory
*/


+ 3
- 1
kernel/simd/intrin_avx512.h View File

@@ -10,10 +10,12 @@ arithmetic
*/
// f32 vector add and multiply, aliased directly to the AVX-512 intrinsics
#define v_add_f32 _mm512_add_ps
#define v_mul_f32 _mm512_mul_ps
// multiply and add, a*b + c
#define v_muladd_f32 _mm512_fmadd_ps
/*
memory
*/
// unaligned load
#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
// NOTE(review): v_storeu_f32 is defined twice below. The function-like form
// accepts and forwards a single argument, whereas the object-like alias hands
// whatever arguments the caller writes straight to _mm512_storeu_ps (callers
// such as daxpy_kernel_8 pass a pointer and a value). This view is a diff hunk
// (+3/-1), so presumably the first definition is the removed line and the
// second its replacement -- confirm against the actual header.
#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR))
#define v_storeu_f32 _mm512_storeu_ps
// set all lanes to VAL (alias of _mm512_set1_ps)
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)

+ 11
- 0
kernel/simd/intrin_sse.h View File

@@ -10,6 +10,17 @@ arithmetic
*/
// f32 vector add and multiply, aliased directly to the SSE intrinsics
#define v_add_f32 _mm_add_ps
#define v_mul_f32 _mm_mul_ps
// v_muladd_f32(a, b, c): multiply and add, a*b + c.
// Selection order: HAVE_FMA3 -> _mm_fmadd_ps, otherwise HAVE_FMA4 ->
// _mm_macc_ps, otherwise an inline helper composed of v_mul_f32 and v_add_f32.
#if defined(HAVE_FMA3)
#define v_muladd_f32 _mm_fmadd_ps
#elif defined(HAVE_FMA4)
#define v_muladd_f32 _mm_macc_ps
#else
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{
    v_f32 product = v_mul_f32(a, b);
    return v_add_f32(product, c);
}
#endif // HAVE_FMA3
/*
memory
*/


+ 2
- 2
kernel/x86_64/daxpy.c View File

@@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
#include"../simd/intrin.h"

void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG register i = 0;
FLOAT a = *alpha;
@@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
__alpha = v_setall_f32(*alpha);
const int vstep = v_nlanes_f32;
for (; i < n; i += vstep) {
tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
v_storeu_f32(y + i, tmp);
}
#else


Loading…
Cancel
Save