Introduced callback to Pthread, Win32 and OpenMP backendtags/v0.3.28^2
| @@ -26,6 +26,11 @@ char* openblas_get_config(void); | |||||
| /*Get the CPU corename on runtime.*/ | /*Get the CPU corename on runtime.*/ | ||||
| char* openblas_get_corename(void); | char* openblas_get_corename(void); | ||||
| /*Set the threading backend to a custom callback.*/ | |||||
| typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data); | |||||
| typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data); | |||||
| void openblas_set_threads_callback_function(openblas_threads_callback callback); | |||||
| #ifdef OPENBLAS_OS_LINUX | #ifdef OPENBLAS_OS_LINUX | ||||
| /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ | /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ | ||||
| int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); | int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); | ||||
| @@ -47,6 +47,11 @@ int BLASFUNC(xerbla)(char *, blasint *info, blasint); | |||||
| void openblas_set_num_threads_(int *); | void openblas_set_num_threads_(int *); | ||||
| /*Set the threading backend to a custom callback.*/ | |||||
| typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data); | |||||
| typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data); | |||||
| extern openblas_threads_callback openblas_threads_callback_; | |||||
| FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); | ||||
| FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); | ||||
| @@ -25,6 +25,7 @@ if (USE_THREAD) | |||||
| ${BLAS_SERVER} | ${BLAS_SERVER} | ||||
| divtable.c # TODO: Makefile has -UDOUBLE | divtable.c # TODO: Makefile has -UDOUBLE | ||||
| blas_l1_thread.c | blas_l1_thread.c | ||||
| blas_server_callback.c | |||||
| ) | ) | ||||
| if (NOT NO_AFFINITY) | if (NOT NO_AFFINITY) | ||||
| @@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) | |||||
| #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | ||||
| ifdef SMP | ifdef SMP | ||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) | |||||
| COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) blas_server_callback.$(SUFFIX) | |||||
| ifneq ($(NO_AFFINITY), 1) | ifneq ($(NO_AFFINITY), 1) | ||||
| COMMONOBJS += init.$(SUFFIX) | COMMONOBJS += init.$(SUFFIX) | ||||
| endif | endif | ||||
| @@ -140,6 +140,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h | |||||
| blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h | blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| blas_server_callback.$(SUFFIX) : blas_server_callback.c ../../common.h | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| @@ -115,6 +115,8 @@ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | |||||
| int blas_omp_threads_local = 1; | int blas_omp_threads_local = 1; | ||||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||||
| /* Local Variables */ | /* Local Variables */ | ||||
| #if defined(USE_PTHREAD_LOCK) | #if defined(USE_PTHREAD_LOCK) | ||||
| static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; | static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; | ||||
| @@ -190,6 +192,10 @@ static int main_status[MAX_CPU_NUMBER]; | |||||
| BLASLONG exit_time[MAX_CPU_NUMBER]; | BLASLONG exit_time[MAX_CPU_NUMBER]; | ||||
| #endif | #endif | ||||
| //Prototypes | |||||
| static void exec_threads(int , blas_queue_t *, int); | |||||
| static void adjust_thread_buffers(); | |||||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | ||||
| if (!(mode & BLAS_COMPLEX)){ | if (!(mode & BLAS_COMPLEX)){ | ||||
| @@ -375,7 +381,6 @@ static void* blas_thread_server(void *arg){ | |||||
| /* Thread identifier */ | /* Thread identifier */ | ||||
| BLASLONG cpu = (BLASLONG)arg; | BLASLONG cpu = (BLASLONG)arg; | ||||
| unsigned int last_tick; | unsigned int last_tick; | ||||
| void *buffer, *sa, *sb; | |||||
| blas_queue_t *queue; | blas_queue_t *queue; | ||||
| blas_queue_t *tscq; | blas_queue_t *tscq; | ||||
| @@ -395,8 +400,6 @@ blas_queue_t *tscq; | |||||
| main_status[cpu] = MAIN_ENTER; | main_status[cpu] = MAIN_ENTER; | ||||
| #endif | #endif | ||||
| buffer = blas_memory_alloc(2); | |||||
| #ifdef SMP_DEBUG | #ifdef SMP_DEBUG | ||||
| fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); | fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); | ||||
| #endif | #endif | ||||
| @@ -456,117 +459,9 @@ blas_queue_t *tscq; | |||||
| start = rpcc(); | start = rpcc(); | ||||
| #endif | #endif | ||||
| if (queue) { | |||||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; | |||||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); | |||||
| sa = queue -> sa; | |||||
| sb = queue -> sb; | |||||
| #ifdef SMP_DEBUG | |||||
| if (queue -> args) { | |||||
| fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||||
| } | |||||
| #endif | |||||
| #ifdef CONSISTENT_FPCSR | |||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||||
| #endif | |||||
| #endif | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING1; | |||||
| #endif | |||||
| //For Loongson servers, like the 3C5000 (featuring 16 cores), applying an | |||||
| //offset to the buffer is essential for minimizing cache conflicts and optimizing performance. | |||||
| #if defined(LOONGSON3R5) && !defined(NO_AFFINITY) | |||||
| char model_name[128]; | |||||
| get_cpu_model(model_name); | |||||
| if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) | |||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||||
| #endif | |||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| if (sb == NULL) { | |||||
| if (!(queue -> mode & BLAS_COMPLEX)){ | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||||
| #ifdef BUILD_DOUBLE | |||||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_SINGLE | |||||
| sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } else { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_COMPLEX | |||||
| sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } | |||||
| queue->sb=sb; | |||||
| } | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING2; | |||||
| #endif | |||||
| if (queue -> mode & BLAS_LEGACY) { | |||||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||||
| } else | |||||
| if (queue -> mode & BLAS_PTHREAD) { | |||||
| void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; | |||||
| (pthreadcompat)(queue -> args); | |||||
| } else | |||||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||||
| #ifdef SMP_DEBUG | |||||
| fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); | |||||
| #endif | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_FINISH; | |||||
| #endif | |||||
| // arm: make sure all results are written out _before_ | |||||
| // thread is marked as done and other threads use them | |||||
| MB; | |||||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); | |||||
| } | |||||
| if(queue) { | |||||
| exec_threads(cpu, queue, 0); | |||||
| } | |||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| main_status[cpu] = MAIN_DONE; | main_status[cpu] = MAIN_DONE; | ||||
| @@ -588,8 +483,6 @@ blas_queue_t *tscq; | |||||
| fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); | fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); | ||||
| #endif | #endif | ||||
| blas_memory_free(buffer); | |||||
| //pthread_exit(NULL); | //pthread_exit(NULL); | ||||
| return NULL; | return NULL; | ||||
| @@ -671,6 +564,9 @@ int blas_thread_init(void){ | |||||
| LOCK_COMMAND(&server_lock); | LOCK_COMMAND(&server_lock); | ||||
| // Adjust thread buffers | |||||
| adjust_thread_buffers(); | |||||
| if (!blas_server_avail){ | if (!blas_server_avail){ | ||||
| thread_timeout_env=openblas_thread_timeout(); | thread_timeout_env=openblas_thread_timeout(); | ||||
| @@ -901,6 +797,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num); | fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num); | ||||
| #endif | #endif | ||||
| //Redirect to caller's callback routine | |||||
| if (openblas_threads_callback_) { | |||||
| int buf_index = 0, i = 0; | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||||
| for (i = 0; i < num; i ++) | |||||
| queue[i].position = i; | |||||
| #endif | |||||
| openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index); | |||||
| return 0; | |||||
| } | |||||
| #ifdef __ELF__ | #ifdef __ELF__ | ||||
| if (omp_in_parallel && (num > 1)) { | if (omp_in_parallel && (num > 1)) { | ||||
| if (omp_in_parallel() > 0) { | if (omp_in_parallel() > 0) { | ||||
| @@ -1074,6 +982,14 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| LOCK_COMMAND(&server_lock); | LOCK_COMMAND(&server_lock); | ||||
| //Free buffers allocated for threads | |||||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| if (blas_server_avail) { | if (blas_server_avail) { | ||||
| for (i = 0; i < blas_num_threads - 1; i++) { | for (i = 0; i < blas_num_threads - 1; i++) { | ||||
| @@ -1110,5 +1026,135 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| static void adjust_thread_buffers() { | |||||
| int i=0; | |||||
| //adjust buffer for each thread | |||||
| for(i=0; i < blas_cpu_number; i++){ | |||||
| if(blas_thread_buffer[i] == NULL){ | |||||
| blas_thread_buffer[i] = blas_memory_alloc(2); | |||||
| } | |||||
| } | |||||
| for(; i < MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i] != NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i] = NULL; | |||||
| } | |||||
| } | |||||
| } | |||||
| static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) { | |||||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; | |||||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); | |||||
| void *buffer = blas_thread_buffer[cpu]; | |||||
| void *sa = queue -> sa; | |||||
| void *sb = queue -> sb; | |||||
| #ifdef SMP_DEBUG | |||||
| if (queue -> args) { | |||||
| fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||||
| } | |||||
| #endif | |||||
| #ifdef CONSISTENT_FPCSR | |||||
| #ifdef __aarch64__ | |||||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||||
| #else | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||||
| #endif | |||||
| #endif | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING1; | |||||
| #endif | |||||
| //For Loongson servers, like the 3C5000 (featuring 16 cores), applying an | |||||
| //offset to the buffer is essential for minimizing cache conflicts and optimizing performance. | |||||
| #if defined(LOONGSON3R5) && !defined(NO_AFFINITY) | |||||
| char model_name[128]; | |||||
| get_cpu_model(model_name); | |||||
| if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) | |||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||||
| #endif | #endif | ||||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| if (sb == NULL) { | |||||
| if (!(queue -> mode & BLAS_COMPLEX)){ | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||||
| #ifdef BUILD_DOUBLE | |||||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_SINGLE | |||||
| sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } else { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_COMPLEX | |||||
| sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } | |||||
| queue->sb=sb; | |||||
| } | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING2; | |||||
| #endif | |||||
| if (queue -> mode & BLAS_LEGACY) { | |||||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||||
| } else | |||||
| if (queue -> mode & BLAS_PTHREAD) { | |||||
| void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; | |||||
| (pthreadcompat)(queue -> args); | |||||
| } else | |||||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||||
| #ifdef SMP_DEBUG | |||||
| fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); | |||||
| #endif | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_FINISH; | |||||
| #endif | |||||
| // arm: make sure all results are written out _before_ | |||||
| // thread is marked as done and other threads use them | |||||
| MB; | |||||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); | |||||
| } | |||||
| #endif | |||||
| @@ -0,0 +1,12 @@ | |||||
| #include "common.h" | |||||
| /* global variable to change threading backend from openblas-managed to caller-managed */ | |||||
| openblas_threads_callback openblas_threads_callback_ = 0; | |||||
| /* non-threadsafe function should be called before any other | |||||
| openblas function to change how threads are managed */ | |||||
| void openblas_set_threads_callback_function(openblas_threads_callback callback) | |||||
| { | |||||
| openblas_threads_callback_ = callback; | |||||
| } | |||||
| @@ -285,7 +285,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||||
| } | } | ||||
| } | } | ||||
| static void exec_threads(blas_queue_t *queue, int buf_index){ | |||||
| static void exec_threads(int thread_num, blas_queue_t *queue, int buf_index){ | |||||
| void *buffer, *sa, *sb; | void *buffer, *sa, *sb; | ||||
| int pos=0, release_flag=0; | int pos=0, release_flag=0; | ||||
| @@ -305,7 +305,7 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ | |||||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | ||||
| pos = omp_get_thread_num(); | |||||
| pos= thread_num; | |||||
| buffer = blas_thread_buffer[buf_index][pos]; | buffer = blas_thread_buffer[buf_index][pos]; | ||||
| //fallback | //fallback | ||||
| @@ -420,18 +420,25 @@ while (true) { | |||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| if (i != MAX_PARALLEL_NUMBER) | |||||
| break; | |||||
| } | |||||
| if (openblas_omp_adaptive_env() != 0) { | |||||
| #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) | |||||
| for (i = 0; i < num; i ++) { | |||||
| if(i != MAX_PARALLEL_NUMBER) | |||||
| break; | |||||
| } | |||||
| /*For caller-managed threading, if caller has registered the callback, pass exec_thread as callback function*/ | |||||
| if (openblas_threads_callback_) { | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||||
| for (i = 0; i < num; i ++) | |||||
| queue[i].position = i; | |||||
| #endif | |||||
| openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index); | |||||
| } else { | |||||
| if (openblas_omp_adaptive_env() != 0) { | |||||
| #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) | |||||
| for (i = 0; i < num; i ++) { | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | #ifndef USE_SIMPLE_THREADED_LEVEL3 | ||||
| queue[i].position = i; | queue[i].position = i; | ||||
| #endif | #endif | ||||
| exec_threads(&queue[i], buf_index); | |||||
| exec_threads(omp_get_thread_num(), &queue[i], buf_index); | |||||
| } | } | ||||
| } else { | } else { | ||||
| #pragma omp parallel for schedule(OMP_SCHED) | #pragma omp parallel for schedule(OMP_SCHED) | ||||
| @@ -441,9 +448,10 @@ if (openblas_omp_adaptive_env() != 0) { | |||||
| queue[i].position = i; | queue[i].position = i; | ||||
| #endif | #endif | ||||
| exec_threads(&queue[i], buf_index); | |||||
| exec_threads(omp_get_thread_num(), &queue[i], buf_index); | |||||
| } | } | ||||
| } | } | ||||
| } | |||||
| #ifdef HAVE_C11 | #ifdef HAVE_C11 | ||||
| atomic_store(&blas_buffer_inuse[buf_index], false); | atomic_store(&blas_buffer_inuse[buf_index], false); | ||||
| @@ -1,3 +1,4 @@ | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
| /* All rights reserved. */ | /* All rights reserved. */ | ||||
| @@ -67,6 +68,8 @@ int blas_server_avail = 0; | |||||
| int blas_omp_threads_local = 1; | int blas_omp_threads_local = 1; | ||||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||||
| /* Local Variables */ | /* Local Variables */ | ||||
| static BLASULONG server_lock = 0; | static BLASULONG server_lock = 0; | ||||
| @@ -74,6 +77,10 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | static DWORD blas_threads_id[MAX_CPU_NUMBER]; | ||||
| static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | ||||
| //Prototypes | |||||
| static void exec_threads(int , blas_queue_t *, int); | |||||
| static void adjust_thread_buffers(); | |||||
| // | // | ||||
| // Legacy code path | // Legacy code path | ||||
| // | // | ||||
| @@ -207,13 +214,9 @@ static DWORD WINAPI blas_thread_server(void *arg) { | |||||
| /* Thread identifier */ | /* Thread identifier */ | ||||
| BLASLONG cpu = (BLASLONG)arg; | BLASLONG cpu = (BLASLONG)arg; | ||||
| void *buffer, *sa, *sb; | |||||
| blas_queue_t *queue; | blas_queue_t *queue; | ||||
| /* Each server needs each buffer */ | |||||
| buffer = blas_memory_alloc(2); | |||||
| MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | ||||
| while (1) { | while (1) { | ||||
| @@ -240,87 +243,14 @@ static DWORD WINAPI blas_thread_server(void *arg) { | |||||
| LeaveCriticalSection(&queue_lock); | LeaveCriticalSection(&queue_lock); | ||||
| if (queue) { | |||||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||||
| sa = queue -> sa; | |||||
| sb = queue -> sb; | |||||
| #ifdef CONSISTENT_FPCSR | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||||
| #endif | |||||
| MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||||
| // fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING1; | |||||
| #endif | |||||
| if (sa == NULL) | |||||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| if (sb == NULL) { | |||||
| if (!(queue -> mode & BLAS_COMPLEX)) { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||||
| #ifdef BUILD_DOUBLE | |||||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_SINGLE | |||||
| sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } else { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_COMPLEX | |||||
| sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } | |||||
| queue->sb=sb; | |||||
| } | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING2; | |||||
| #endif | |||||
| if(queue) { | |||||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||||
| } else { | |||||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||||
| } | |||||
| exec_threads(cpu, queue, 0); | |||||
| } else { | } else { | ||||
| continue; //if queue == NULL | |||||
| } | |||||
| continue; //if queue == NULL | |||||
| } | |||||
| MT_TRACE("Server[%2ld] Finished!\n", cpu); | MT_TRACE("Server[%2ld] Finished!\n", cpu); | ||||
| queue->finished = 1; | queue->finished = 1; | ||||
| @@ -330,8 +260,6 @@ static DWORD WINAPI blas_thread_server(void *arg) { | |||||
| MT_TRACE("Server[%2ld] Shutdown!\n", cpu); | MT_TRACE("Server[%2ld] Shutdown!\n", cpu); | ||||
| blas_memory_free(buffer); | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -345,6 +273,8 @@ int blas_thread_init(void) { | |||||
| LOCK_COMMAND(&server_lock); | LOCK_COMMAND(&server_lock); | ||||
| adjust_thread_buffers(); | |||||
| MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | ||||
| if (!blas_server_avail) { | if (!blas_server_avail) { | ||||
| @@ -473,6 +403,17 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||||
| if ((num <= 0) || (queue == NULL)) return 0; | if ((num <= 0) || (queue == NULL)) return 0; | ||||
| //Redirect to caller's callback routine | |||||
| if (openblas_threads_callback_) { | |||||
| int buf_index = 0, i = 0; | |||||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||||
| for (i = 0; i < num; i ++) | |||||
| queue[i].position = i; | |||||
| #endif | |||||
| openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index); | |||||
| return 0; | |||||
| } | |||||
| if ((num > 1) && queue -> next) | if ((num > 1) && queue -> next) | ||||
| exec_blas_async(1, queue -> next); | exec_blas_async(1, queue -> next); | ||||
| @@ -507,6 +448,14 @@ int BLASFUNC(blas_thread_shutdown)(void) { | |||||
| LOCK_COMMAND(&server_lock); | LOCK_COMMAND(&server_lock); | ||||
| //Free buffers allocated for threads | |||||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i]!=NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i]=NULL; | |||||
| } | |||||
| } | |||||
| if (blas_server_avail) { | if (blas_server_avail) { | ||||
| for (i = 0; i < blas_num_threads - 1; i++) { | for (i = 0; i < blas_num_threads - 1; i++) { | ||||
| @@ -610,3 +559,106 @@ void openblas_set_num_threads(int num) | |||||
| { | { | ||||
| goto_set_num_threads(num); | goto_set_num_threads(num); | ||||
| } | } | ||||
| static void adjust_thread_buffers() { | |||||
| int i=0; | |||||
| //adjust buffer for each thread | |||||
| for(i=0; i < blas_cpu_number; i++){ | |||||
| if(blas_thread_buffer[i] == NULL){ | |||||
| blas_thread_buffer[i] = blas_memory_alloc(2); | |||||
| } | |||||
| } | |||||
| for(; i < MAX_CPU_NUMBER; i++){ | |||||
| if(blas_thread_buffer[i] != NULL){ | |||||
| blas_memory_free(blas_thread_buffer[i]); | |||||
| blas_thread_buffer[i] = NULL; | |||||
| } | |||||
| } | |||||
| } | |||||
| //Indivitual threads work executor, Helps in setting by synchronization environment and calling inner_threads routine | |||||
| static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) { | |||||
| void *buffer, *sa, *sb; | |||||
| buffer = blas_thread_buffer[cpu]; | |||||
| sa = queue -> sa; | |||||
| sb = queue -> sb; | |||||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||||
| #ifdef CONSISTENT_FPCSR | |||||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||||
| #endif | |||||
| MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||||
| // fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING1; | |||||
| #endif | |||||
| if (sa == NULL) | |||||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||||
| if (sb == NULL) { | |||||
| if (!(queue -> mode & BLAS_COMPLEX)) { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||||
| #ifdef BUILD_DOUBLE | |||||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_SINGLE | |||||
| sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } else { | |||||
| #ifdef EXPRECISION | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| } else | |||||
| #endif | |||||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||||
| #ifdef BUILD_COMPLEX16 | |||||
| sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||||
| #ifdef BUILD_COMPLEX | |||||
| sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) | |||||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||||
| #endif | |||||
| } else { | |||||
| /* Other types in future */ | |||||
| } | |||||
| } | |||||
| queue->sb=sb; | |||||
| } | |||||
| #ifdef MONITOR | |||||
| main_status[cpu] = MAIN_RUNNING2; | |||||
| #endif | |||||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||||
| } else { | |||||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||||
| } | |||||
| } | |||||