@@ -140,6 +140,16 @@ typedef struct {
 
 } thread_status_t;
 
+#if (__STDC_VERSION__ >= 201112L)
+#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
+#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#else
+#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
+#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
+#endif
+
 static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
 
 #ifndef THREAD_TIMEOUT
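These two macros are the heart of the patch. On a C11 compiler they map to the GCC/Clang `__atomic` builtins; otherwise they fall back to a plain volatile pointer access. Note that `__ATOMIC_RELAXED` only guarantees that each load or store of the queue pointer is indivisible (never torn); it imposes no ordering on surrounding memory operations, which is why the hunks below still keep explicit `MB` barriers next to the stores. A minimal standalone sketch of what the builtins provide, assuming GCC or Clang (`slot` and `node_t` are illustrative names, not from blas_server.c):

    #include <stdio.h>

    typedef struct node { int payload; } node_t;

    static node_t *slot;    /* stands in for thread_status[i].queue */

    int main(void) {
        node_t n = { 42 };

        /* Relaxed store: indivisible, but orders nothing around it. */
        __atomic_store_n(&slot, &n, __ATOMIC_RELAXED);

        /* Relaxed load: always returns a whole pointer, never a torn one. */
        node_t *p = __atomic_load_n(&slot, __ATOMIC_RELAXED);

        printf("%d\n", p->payload);    /* prints 42 */
        return 0;
    }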
@@ -312,20 +322,19 @@ blas_queue_t *tscq;
 
     last_tick = (unsigned int)rpcc();
 
-    pthread_mutex_lock (&thread_status[cpu].lock);
-    tscq=thread_status[cpu].queue;
-    pthread_mutex_unlock (&thread_status[cpu].lock);
+    tscq = atomic_load_queue(&thread_status[cpu].queue);
 
     while(!tscq) {
       YIELDING;
 
       if ((unsigned int)rpcc() - last_tick > thread_timeout) {
 
-        pthread_mutex_lock (&thread_status[cpu].lock);
-
-        if (!thread_status[cpu].queue) {
+        if (!atomic_load_queue(&thread_status[cpu].queue)) {
+          pthread_mutex_lock (&thread_status[cpu].lock);
           thread_status[cpu].status = THREAD_STATUS_SLEEP;
-          while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
+          while (thread_status[cpu].status == THREAD_STATUS_SLEEP &&
+                 !atomic_load_queue(&thread_status[cpu].queue)) {
 
 #ifdef MONITOR
             main_status[cpu] = MAIN_SLEEPING;
@@ -333,19 +342,18 @@ blas_queue_t *tscq;
 
             pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
           }
+          pthread_mutex_unlock(&thread_status[cpu].lock);
         }
-        pthread_mutex_unlock(&thread_status[cpu].lock);
 
         last_tick = (unsigned int)rpcc();
       }
 
-      pthread_mutex_lock (&thread_status[cpu].lock);
-      tscq=thread_status[cpu].queue;
-      pthread_mutex_unlock (&thread_status[cpu].lock);
+      tscq = atomic_load_queue(&thread_status[cpu].queue);
 
     }
 
-    queue = thread_status[cpu].queue;
+    queue = atomic_load_queue(&thread_status[cpu].queue);
     MB;
 
     if ((long)queue == -1) break;
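These two hunks rework the worker's wait loop: the thread spins on its queue slot with `YIELDING`, and once `thread_timeout` ticks pass it takes the mutex and blocks on the condition variable. The new `!atomic_load_queue(...)` term in the sleep guard closes the lost-wakeup window where a dispatcher publishes work between the unlocked check and `pthread_cond_wait`. A compilable sketch of the same spin-then-sleep pattern, with hypothetical names (`SPIN_LIMIT` plays the role of `thread_timeout`; build with -pthread):

    #include <pthread.h>
    #include <sched.h>

    #define SPIN_LIMIT 10000

    static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wakeup = PTHREAD_COND_INITIALIZER;
    static void *queue;      /* NULL means "no work posted yet" */
    static int   sleeping;   /* analogue of THREAD_STATUS_SLEEP */

    static void *wait_for_work(void) {      /* worker side */
        unsigned spins = 0;
        void *q = __atomic_load_n(&queue, __ATOMIC_RELAXED);
        while (!q) {
            sched_yield();                   /* the YIELDING macro */
            if (++spins > SPIN_LIMIT) {
                if (!__atomic_load_n(&queue, __ATOMIC_RELAXED)) {
                    pthread_mutex_lock(&lock);
                    sleeping = 1;
                    /* Re-testing the queue inside the guard is what the
                     * patch adds: without it, work posted right here
                     * would never be noticed and the thread could sleep
                     * until the next explicit signal. */
                    while (sleeping &&
                           !__atomic_load_n(&queue, __ATOMIC_RELAXED))
                        pthread_cond_wait(&wakeup, &lock);
                    pthread_mutex_unlock(&lock);
                }
                spins = 0;      /* the patch resets last_tick here */
            }
            q = __atomic_load_n(&queue, __ATOMIC_RELAXED);
        }
        return q;
    }

    static void post_work(void *work) {      /* dispatcher side */
        __atomic_store_n(&queue, work, __ATOMIC_RELAXED);
        pthread_mutex_lock(&lock);
        sleeping = 0;
        pthread_cond_signal(&wakeup);
        pthread_mutex_unlock(&lock);
    }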
@@ -360,9 +368,7 @@ blas_queue_t *tscq;
 
     if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
-      pthread_mutex_lock (&thread_status[cpu].lock);
-      thread_status[cpu].queue = (blas_queue_t *)1;
-      pthread_mutex_unlock (&thread_status[cpu].lock);
+      atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
 
       sa = queue -> sa;
       sb = queue -> sb;
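The queue slot doubles as a small state machine. The values visible in this patch are: 0 (idle, previous results published), 1 (claimed, the routine is running), a real pointer greater than 1 (work posted but not yet picked up), and -1 (shutdown request, last hunk). Storing the sentinel 1 atomically replaces the lock/store/unlock triple: only the owning worker ever performs this transition, so it races only with readers polling the slot, never with another writer. A condensed sketch with illustrative names:

    /* Slot protocol implied by the patch:
     *   (void *)0   idle, previous results visible
     *   (void *)1   claimed, routine running
     *   ptr  >  1   work posted, not yet claimed
     *   (void *)-1  shutdown request                 */
    static void claim(void *volatile *slot) {
        /* One relaxed store replaces lock/store/unlock: the owning
         * worker is the only writer of this transition. */
        __atomic_store_n(slot, (void *)1, __ATOMIC_RELAXED);
    }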
@@ -442,13 +448,9 @@ blas_queue_t *tscq;
 
       // arm: make sure all results are written out _before_
       // thread is marked as done and other threads use them
-      WMB;
+      MB;
+      atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
 
-      pthread_mutex_lock (&thread_status[cpu].lock);
-      thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
-      pthread_mutex_unlock (&thread_status[cpu].lock);
-
-      WMB;
-
     }
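This hunk is the release side of the hand-off: the worker's results must be globally visible before the slot is cleared, because waiters in exec_blas_async_wait treat a NULL slot as "results ready". The patch strengthens the write barrier to a full `MB`, and the mutex plus the old `& 0` store trick become a single atomic store of 0. A sketch of the same ordering using the portable GCC fence (hypothetical names; the patch uses OpenBLAS's own MB macro instead):

    static void finish_work(void *volatile *slot) {
        /* ... the routine has written all of its output buffers ... */
        __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* plays the role of MB */
        __atomic_store_n(slot, (void *)0, __ATOMIC_RELAXED);
    }

A release store (`__ATOMIC_RELEASE`) on the slot would express the same intent in pure C11; the patch keeps the relaxed store plus an explicit barrier, matching the style of the surrounding code.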
@@ -566,7 +568,7 @@ int blas_thread_init(void){
 
   for(i = 0; i < blas_num_threads - 1; i++){
 
-    thread_status[i].queue = (blas_queue_t *)NULL;
+    atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
     thread_status[i].status = THREAD_STATUS_WAKEUP;
 
     pthread_mutex_init(&thread_status[i].lock, NULL);
@@ -655,7 +657,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
   if (queue -> mode & BLAS_NODE) {
 
     do {
-      while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
+      while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++;
+
       if (i < blas_num_threads - 1) break;
@@ -669,36 +672,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
     } while (1);
 
   } else {
-    pthread_mutex_lock (&thread_status[i].lock);
-    tsiq = thread_status[i].queue;
-    pthread_mutex_unlock (&thread_status[i].lock);
+    tsiq = atomic_load_queue(&thread_status[i].queue);
     while(tsiq) {
       i ++;
       if (i >= blas_num_threads - 1) i = 0;
-      pthread_mutex_lock (&thread_status[i].lock);
-      tsiq = thread_status[i].queue;
-      pthread_mutex_unlock (&thread_status[i].lock);
+      tsiq = atomic_load_queue(&thread_status[i].queue);
     }
   }
 #else
-  pthread_mutex_lock (&thread_status[i].lock);
-  tsiq=thread_status[i].queue ;
-  pthread_mutex_unlock (&thread_status[i].lock);
+  tsiq = atomic_load_queue(&thread_status[i].queue);
   while(tsiq) {
     i ++;
     if (i >= blas_num_threads - 1) i = 0;
-    pthread_mutex_lock (&thread_status[i].lock);
-    tsiq=thread_status[i].queue ;
-    pthread_mutex_unlock (&thread_status[i].lock);
+    tsiq = atomic_load_queue(&thread_status[i].queue);
   }
 #endif
 
   queue -> assigned = i;
-  WMB;
-  pthread_mutex_lock (&thread_status[i].lock);
-  thread_status[i].queue = queue;
-  pthread_mutex_unlock (&thread_status[i].lock);
-  WMB;
+  MB;
+
+  atomic_store_queue(&thread_status[i].queue, queue);
 
   queue = queue -> next;
   pos ++;
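The dispatch side mirrors the completion hunk: every field the consumer will read (`assigned`, the routine, its arguments) is written first, one full barrier fences those writes, then the relaxed pointer store publishes the entry to the spinning worker. A sketch with hypothetical names:

    typedef struct work { int assigned; /* plus routine, args, ... */ } work_t;

    static void publish(void *volatile *slot, work_t *w, int worker_id) {
        w->assigned = worker_id;                  /* like queue -> assigned = i */
        __atomic_thread_fence(__ATOMIC_SEQ_CST);  /* plays the role of MB */
        /* The relaxed store is safe only because of the fence above;
         * plain C11 would use a release store here instead. */
        __atomic_store_n(slot, (void *)w, __ATOMIC_RELAXED);
    }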
@@ -718,9 +711,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
   pos = current -> assigned;
 
-  pthread_mutex_lock (&thread_status[pos].lock);
-  tspq=thread_status[pos].queue;
-  pthread_mutex_unlock (&thread_status[pos].lock);
+  tspq = atomic_load_queue(&thread_status[pos].queue);
 
   if ((BLASULONG)tspq > 1) {
     pthread_mutex_lock (&thread_status[pos].lock);
@@ -752,24 +743,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 
   while ((num > 0) && queue) {
 
-    pthread_mutex_lock(&thread_status[queue->assigned].lock);
-    tsqq=thread_status[queue -> assigned].queue;
-    pthread_mutex_unlock(&thread_status[queue->assigned].lock);
+    tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
 
     while(tsqq) {
       YIELDING;
-      pthread_mutex_lock(&thread_status[queue->assigned].lock);
-      tsqq=thread_status[queue -> assigned].queue;
-      pthread_mutex_unlock(&thread_status[queue->assigned].lock);
+      tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
     };
 
     queue = queue -> next;
     num --;
   }
 
   MB;
 
 #ifdef SMP_DEBUG
   fprintf(STDERR, "Done.\n\n");
 #endif
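exec_blas_async_wait is the acquire side of the same hand-off: it spins until each assigned slot returns to NULL, and the `MB` after the loop fences the polling loads from the subsequent reads of the workers' results. A sketch of one slot's wait, hypothetical names:

    #include <sched.h>

    static void wait_done(void *volatile *slot) {
        while (__atomic_load_n(slot, __ATOMIC_RELAXED))
            sched_yield();                        /* like YIELDING */
        __atomic_thread_fence(__ATOMIC_SEQ_CST);  /* like the MB after the loop */
    }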
@@ -880,7 +867,7 @@ void goto_set_num_threads(int num_threads) {
 
   for(i = blas_num_threads - 1; i < num_threads - 1; i++){
 
-    thread_status[i].queue = (blas_queue_t *)NULL;
+    atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
     thread_status[i].status = THREAD_STATUS_WAKEUP;
 
     pthread_mutex_init(&thread_status[i].lock, NULL);
@@ -971,12 +958,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   for (i = 0; i < blas_num_threads - 1; i++) {
 
-    pthread_mutex_lock (&thread_status[i].lock);
-
-    thread_status[i].queue = (blas_queue_t *)-1;
+    pthread_mutex_lock (&thread_status[i].lock);
+    atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
     thread_status[i].status = THREAD_STATUS_WAKEUP;
 
     pthread_cond_signal (&thread_status[i].wakeup);
 
     pthread_mutex_unlock(&thread_status[i].lock);
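Shutdown is the one place where the queue store deliberately stays under the mutex: the -1 sentinel must be published while the lock is held so a worker sitting between its guard re-check and `pthread_cond_wait` cannot miss both the store and the signal. A sketch, reusing the hypothetical `worker_t` type from the earlier wakeup sketch:

    static void shut_down(worker_t *w) {
        pthread_mutex_lock(&w->lock);
        /* Stored under the lock so the sentinel and the signal are seen
         * as one event by a worker about to sleep. */
        __atomic_store_n(&w->queue, (void *)-1, __ATOMIC_RELAXED);
        w->status = 0;                      /* force the sleep guard false */
        pthread_cond_signal(&w->wakeup);
        pthread_mutex_unlock(&w->lock);
    }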