Move sgemm_direct_performant helper to separate file

5 years ago · 56d4d4f84b
--- a/kernel/x86_64/sgemm_direct_performant.c
+++ b/kernel/x86_64/sgemm_direct_performant.c
@@ -0,0 +1,30 @@
 #include "common.h"
 /* helper for the direct sgemm code written by Arjan van der Ven */




 /*
  * Heuristic deciding whether the "direct" sgemm kernel should be used for
  * the given M, N and K.
  *
  * Returns 1 when the direct kernel is expected to be performant, 0 otherwise.
  *
  * The product M * N * K is computed in unsigned long long: multiplying the
  * signed BLASLONG values first and converting afterwards would be undefined
  * behaviour if the signed product overflowed. For every input whose signed
  * product does not overflow the result is unchanged.
  */
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
 {
 	unsigned long long mnk = (unsigned long long)M
 	                       * (unsigned long long)N
 	                       * (unsigned long long)K;

 	/* large matrices -> not performant */
 	if (mnk >= 28 * 512 * 512)
 		return 0;

 	/*
 	 * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
 	 * and the regular sgemm copy/realignment of data pays off much quicker
 	 */
 	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
 		return 0;

 #ifdef SMP
 	/*
 	 * if we can run multithreaded (num_cpu_avail(3) > 1), the threading
 	 * changes the base threshold
 	 */
 	if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1)
 		return 0;
 #endif

 	return 1;
 }


--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,7 +1,7 @@

 #if defined(SKYLAKEX) || defined (COOPERLAKE)
 /* the direct sgemm code written by Arjan van der Ven */
 //#include <immintrin.h>

 #include <immintrin.h>
 #include "common.h"
 /*
 * "Direct sgemm" code. This code operates directly on the inputs and outputs
 * of the sgemm call, avoiding the copies, memory realignments and threading,
@@ -38,6 +38,7 @@
 #define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
 #define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;

 #if 0
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
 	unsigned long long mnk = M * N * K;
@@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 	return 1;
 }

 #endif


 void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 //void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 {
 	int i, j, k;

@@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
 		}
 	}
 }
 #else
 #include "common.h"
 /*
  * Empty stand-in with the same signature as the direct sgemm kernel: the body
  * does nothing and none of the arguments are read or written.
  * NOTE(review): presumably this #else branch pairs with the
  * "#if defined(SKYLAKEX) || defined (COOPERLAKE)" guard at the top of the
  * file, so the symbol still exists on other targets -- confirm against the
  * full file.
  */
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 {}
 #endif