Browse Source

Merge pull request #3772 from siko1056/develop

Support CONSISTENT_FPCSR on aarch64 systems
tags/v0.3.22^2
Martin Kroeker GitHub 3 years ago
parent
commit
8e851160d7
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 1 deletion
  1. +1
    -1
      Makefile.rule
  2. +8
    -0
      driver/others/blas_server.c
  3. +8
    -0
      driver/others/blas_server_omp.c

+ 1
- 1
Makefile.rule View File

@@ -207,7 +207,7 @@ NO_AFFINITY = 1
# to the user space. If bigphysarea is enabled, it will use it. # to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1 # DEVICEDRIVER_ALLOCATION = 1


# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only).
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1


# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed


+ 8
- 0
driver/others/blas_server.c View File

@@ -470,9 +470,13 @@ blas_queue_t *tscq;
#endif #endif


#ifdef CONSISTENT_FPCSR #ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif #endif
#endif


#ifdef MONITOR #ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1; main_status[cpu] = MAIN_RUNNING1;
@@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
queue -> position = pos; queue -> position = pos;


#ifdef CONSISTENT_FPCSR #ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
#endif #endif
#endif


#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)




+ 8
- 0
driver/others/blas_server_omp.c View File

@@ -284,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){
sb = queue -> sb; sb = queue -> sb;


#ifdef CONSISTENT_FPCSR #ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#endif #endif


if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
@@ -383,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){


#ifdef CONSISTENT_FPCSR #ifdef CONSISTENT_FPCSR
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {
#ifdef __aarch64__
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode));
#else
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode));
#endif
} }
#endif #endif




Loading…
Cancel
Save