| @@ -72,19 +72,9 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | |||
| static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | |||
| #if defined (__GNUC__) && (__GNUC__ < 6) | |||
| #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) | |||
| #else | |||
| #if defined(_WIN64) | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) | |||
| #else | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) | |||
| #endif | |||
| #endif | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| if (!(mode & BLAS_COMPLEX)){ | |||
| if (!(mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| /* REAL / Extended Double */ | |||
| @@ -99,7 +89,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* REAL / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -110,7 +100,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { | |||
| /* REAL / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| @@ -122,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -133,7 +123,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { | |||
| /* REAL / BLAS_STOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -144,7 +134,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { | |||
| /* REAL / BLAS_DTOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -161,7 +151,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } else { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| /* COMPLEX / Extended Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| @@ -175,7 +165,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* COMPLEX / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -205,10 +195,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| /* This is a main routine of threads. Each thread waits until job is */ | |||
| /* queued. */ | |||
| static DWORD WINAPI blas_thread_server(void *arg){ | |||
| // This is the main routine of each worker thread. Each thread waits until a job is | |||
| // queued. | |||
| static DWORD WINAPI blas_thread_server(void *arg) { | |||
| /* Thread identifier */ | |||
| BLASLONG cpu = (BLASLONG)arg; | |||
| @@ -221,24 +210,22 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | |||
| while (1){ | |||
| while (1) { | |||
| /* Waiting for Queue */ | |||
| MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| if (cpu > thread_target - 2) | |||
| { | |||
| //MT_TRACE("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| if (cpu > thread_target - 2) { | |||
| //MT_TRACE("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| MT_TRACE("Server[%2ld] Got it.\n", cpu); | |||
| #if 1 | |||
| EnterCriticalSection(&queue_lock); | |||
| queue = work_queue; | |||
| @@ -246,51 +233,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| work_queue = work_queue->next; | |||
| LeaveCriticalSection(&queue_lock); | |||
| #else | |||
| volatile blas_queue_t* queue_next; | |||
| INT_PTR prev_value; | |||
| do { | |||
| queue = (volatile blas_queue_t*)work_queue; | |||
| if (!queue) | |||
| break; | |||
| queue_next = (volatile blas_queue_t*)queue->next; | |||
| prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); | |||
| } while (prev_value != queue); | |||
| #endif | |||
| if (queue) { | |||
| if (queue) { | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
| sa = queue -> sa; | |||
| sb = queue -> sb; | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||
| // fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sa == NULL) | |||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sb == NULL) { | |||
| if (!(queue -> mode & BLAS_COMPLEX)){ | |||
| if (!(queue -> mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } else | |||
| #endif | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| #ifdef BUILD_DOUBLE | |||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| @@ -324,26 +299,25 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| /* Other types in future */ | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| } else { | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } | |||
| }else{ | |||
| continue; //if queue == NULL | |||
| } | |||
| } else { | |||
| continue; //if queue == NULL | |||
| } | |||
| MT_TRACE("Server[%2ld] Finished!\n", cpu); | |||
| queue->finished = 1; | |||
| queue->finished = 1; | |||
| } | |||
| /* Shutdown procedure */ | |||
| @@ -353,10 +327,12 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| blas_memory_free(buffer); | |||
| return 0; | |||
| } | |||
| } | |||
| /* Initializing routine */ | |||
| int blas_thread_init(void){ | |||
| // | |||
| // Initializing routine | |||
| // | |||
| int blas_thread_init(void) { | |||
| BLASLONG i; | |||
| if (blas_server_avail || (blas_cpu_number <= 1)) return 0; | |||
| @@ -365,16 +341,16 @@ int blas_thread_init(void){ | |||
| MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | |||
| if (!blas_server_avail){ | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| thread_target = blas_cpu_number; | |||
| thread_target = blas_cpu_number; | |||
| InitializeCriticalSection(&queue_lock); | |||
| for(i = 0; i < blas_cpu_number - 1; i++){ | |||
| //MT_TRACE("thread_init: creating thread [%d]\n", i); | |||
| for(i = 0; i < blas_cpu_number - 1; i++) { | |||
| //MT_TRACE("thread_init: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -391,13 +367,10 @@ int blas_thread_init(void){ | |||
| /* | |||
| The user can call one of two routines. | |||
| exec_blas_async ... immediately returns after jobs are queued. | |||
| exec_blas ... returns after jobs are finished. | |||
| */ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -417,7 +390,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); | |||
| #endif | |||
| current->finished = 0; | |||
| current->finished = 0; | |||
| current = current -> next; | |||
| pos ++; | |||
| } | |||
| @@ -426,18 +399,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| if (!work_queue) | |||
| { | |||
| work_queue = queue; | |||
| work_queue = queue; | |||
| } | |||
| else | |||
| { | |||
| blas_queue_t *next_item = work_queue; | |||
| // find the end of the work queue | |||
| while (next_item) | |||
| next_item = next_item->next; | |||
| // find the end of the work queue | |||
| while (next_item) | |||
| next_item = next_item->next; | |||
| // add new work to the end | |||
| next_item = queue; | |||
| // add new work to the end | |||
| next_item = queue; | |||
| } | |||
| LeaveCriticalSection(&queue_lock); | |||
| @@ -447,20 +420,24 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Join. Wait for all queued tasks to complete | |||
| // | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { | |||
| MT_TRACE("Synchronization Waiting.\n"); | |||
| MT_TRACE("Synchronization Waiting.\n"); | |||
| while (num){ | |||
| MT_TRACE("Waiting Queue ..\n"); | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| while (num) { | |||
| MT_TRACE("Waiting Queue ..\n"); | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| MT_TRACE("Completely Done.\n\n"); | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| MT_TRACE("Completely Done.\n\n"); | |||
| // if work was added to the queue after this batch we can't sleep the worker threads | |||
| // by resetting the event | |||
| @@ -474,8 +451,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| /* Execute Threads */ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Execute Threads | |||
| // | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -507,9 +486,8 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| /* Shutdown procedure, but user don't have to call this routine. The */ | |||
| /* kernel automatically kill threads. */ | |||
| // Shutdown procedure. The user doesn't have to call this routine; the | |||
| // kernel automatically kills the threads. | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i; | |||
| @@ -518,9 +496,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| LOCK_COMMAND(&server_lock); | |||
| if (blas_server_avail){ | |||
| if (blas_server_avail) { | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| for(i = 0; i < blas_num_threads - 1; i++) { | |||
| // Could also just use WaitForMultipleObjects | |||
| DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); | |||
| @@ -555,7 +533,7 @@ void goto_set_num_threads(int num_threads) | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| LOCK_COMMAND(&server_lock); | |||
| thread_target = num_threads; | |||
| @@ -586,7 +564,7 @@ void goto_set_num_threads(int num_threads) | |||
| thread_target = num_threads; | |||
| //increased_threads = 1; | |||
| if (!blas_server_avail){ | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| @@ -595,7 +573,7 @@ void goto_set_num_threads(int num_threads) | |||
| blas_server_avail = 1; | |||
| } | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { | |||
| //MT_TRACE("set_num_threads: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||