Support CONSISTENT_FPCSR on aarch64 systems
tags/v0.3.22^2
| @@ -207,7 +207,7 @@ NO_AFFINITY = 1 | |||||
| # to the user space. If bigphysarea is enabled, it will use it. | # to the user space. If bigphysarea is enabled, it will use it. | ||||
| # DEVICEDRIVER_ALLOCATION = 1 | # DEVICEDRIVER_ALLOCATION = 1 | ||||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | |||||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). | |||||
| # CONSISTENT_FPCSR = 1 | # CONSISTENT_FPCSR = 1 | ||||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | ||||
| @@ -470,9 +470,13 @@ blas_queue_t *tscq; | |||||
| #endif | #endif | ||||
| #ifdef CONSISTENT_FPCSR | #ifdef CONSISTENT_FPCSR | ||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | ||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| main_status[cpu] = MAIN_RUNNING1; | main_status[cpu] = MAIN_RUNNING1; | ||||
| @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| queue -> position = pos; | queue -> position = pos; | ||||
| #ifdef CONSISTENT_FPCSR | #ifdef CONSISTENT_FPCSR | ||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); | __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); | ||||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); | __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) | #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) | ||||
| @@ -284,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ | |||||
| sb = queue -> sb; | sb = queue -> sb; | ||||
| #ifdef CONSISTENT_FPCSR | #ifdef CONSISTENT_FPCSR | ||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | ||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | ||||
| #endif | |||||
| #endif | #endif | ||||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | ||||
| @@ -383,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| #ifdef CONSISTENT_FPCSR | #ifdef CONSISTENT_FPCSR | ||||
| for (i = 0; i < num; i ++) { | for (i = 0; i < num; i ++) { | ||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); | __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); | ||||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); | __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||