Browse Source

Move sgemm_direct_performant helper to separate file

pull/2782/head
Martin Kroeker GitHub 5 years ago
parent
commit
56d4d4f84b
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 5 deletions
  1. +30
    -0
      kernel/x86_64/sgemm_direct_performant.c
  2. +12
    -5
      kernel/x86_64/sgemm_direct_skylakex.c

+ 30
- 0
kernel/x86_64/sgemm_direct_performant.c View File

@@ -0,0 +1,30 @@
#include "common.h"
/* helper for the direct sgemm code written by Arjan van der Ven */




/*
 * Heuristic that tells the caller whether the "direct" sgemm code is worth
 * using for a problem with dimensions M, N and K.
 *
 * Returns 0 as soon as one of the size checks below judges the direct path
 * not performant, and 1 otherwise.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
{
  /*
   * Widen every dimension before multiplying so the product is formed in
   * unsigned long long arithmetic. Multiplying in BLASLONG first and only
   * converting the result could overflow the narrower type (undefined
   * behaviour if BLASLONG is signed -- presumably it is; confirm in common.h).
   * For inputs that did not overflow, the value is unchanged.
   */
  unsigned long long mnk =
      (unsigned long long)M * (unsigned long long)N * (unsigned long long)K;

  /* large matrices -> not performant */
  if (mnk >= 28 * 512 * 512) {
    return 0;
  }

  /*
   * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
   * and the regular sgemm copy/realignment of data pays off much quicker
   */
  if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) {
    return 0;
  }

#ifdef SMP
  /* if we can run multithreaded, the threading changes the base threshold */
  if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1) {
    return 0;
  }
#endif

  return 1;
}



+ 12
- 5
kernel/x86_64/sgemm_direct_skylakex.c View File

@@ -1,7 +1,7 @@
#if defined(SKYLAKEX) || defined (COOPERLAKE)
/* the direct sgemm code written by Arjan van der Ven */
//#include <immintrin.h>
#include <immintrin.h>
#include "common.h"
/*
* "Direct sgemm" code. This code operates directly on the inputs and outputs
* of the sgemm call, avoiding the copies, memory realignments and threading,
@@ -38,6 +38,7 @@
#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N;
#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M;

#if 0
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
{
unsigned long long mnk = M * N * K;
@@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
return 1;
}

#endif

void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
{
int i, j, k;

@@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
}
}
}
#else
#include "common.h"
/*
 * No-op stand-in for the direct sgemm kernel, compiled in the #else branch
 * (presumably when neither SKYLAKEX nor COOPERLAKE is defined -- confirm
 * against the opening #if). It takes the same argument list as the real
 * kernel and does nothing with it.
 */
void CNAME(BLASLONG M, BLASLONG N, BLASLONG K,
           float * __restrict A, BLASLONG strideA,
           float * __restrict B, BLASLONG strideB,
           float * __restrict R, BLASLONG strideR)
{
  /* intentionally empty */
}
#endif

Loading…
Cancel
Save