diff --git a/kernel/arm64/sgemm_direct_performant.c b/kernel/arm64/sgemm_direct_performant.c new file mode 100644 index 000000000..c5c42bccc --- /dev/null +++ b/kernel/arm64/sgemm_direct_performant.c @@ -0,0 +1,31 @@ +#include "common.h" +/* helper for the direct sgemm code written by Arjan van der Ven */ + + + + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ +if (M<3 || M%2==1) return 0; + unsigned long long mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + +