diff --git a/kernel/arm64/sgemm_direct_performant.c b/kernel/arm64/sgemm_direct_performant.c
new file mode 100644
index 000000000..c5c42bccc
--- /dev/null
+++ b/kernel/arm64/sgemm_direct_performant.c
@@ -0,0 +1,31 @@
+#include "common.h"
+/* helper for the direct sgemm code written by Arjan van der Ven */
+
+
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+if (M<3 || M%2==1) return 0;
+	unsigned long long mnk = M * N * K;
+	/* large matrixes -> not performant */
+	if (mnk >= 28 * 512 * 512)
+		return 0;
+
+	/*
+	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
+	 * and the regular sgemm copy/realignment of data pays off much quicker
+	 */
+	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+		return 0;
+
+#ifdef SMP
+	/* if we can run multithreaded, the threading changes the based threshold */
+	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
+		return 0;
+#endif
+
+	return 1;
+}
+
+