Browse Source
Merge pull request #5353 from nakagawa-fj/feature/gemm_divide_rate_for_A64FX
Multi-thread Performance Improvement of GEMM with DIVIDE_RATE=1 for A64FX
pull/5356/head
Martin Kroeker
GitHub
10 months ago
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 0 deletions
- driver/level3/gemm.c
- param.h
|
|
|
@@ -59,6 +59,10 @@ |
|
|
|
#define GEMM_Q 128 |
|
|
|
#endif |
|
|
|
|
|
|
|
#ifdef GEMM_DIVIDE_RATE |
|
|
|
#define DIVIDE_RATE GEMM_DIVIDE_RATE |
|
|
|
#endif |
|
|
|
|
|
|
|
#ifdef THREADED_LEVEL3 |
|
|
|
#include "level3_thread.c" |
|
|
|
#else |
|
|
|
|
|
|
|
@@ -3701,6 +3701,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d |
|
|
|
|
|
|
|
#elif defined(A64FX) // 512-bit SVE |
|
|
|
|
|
|
|
#define GEMM_DIVIDE_RATE 1 |
|
|
|
|
|
|
|
#if defined(XDOUBLE) || defined(DOUBLE) |
|
|
|
#define GEMM_PREFERED_SIZE 8 |
|
|
|
#else |
|
|
|
|