|
|
|
@@ -91,11 +91,7 @@ |
|
|
|
#endif |
|
|
|
|
|
|
|
typedef struct { |
|
|
|
#if __STDC_VERSION__ >= 201112L |
|
|
|
_Atomic |
|
|
|
#else |
|
|
|
volatile |
|
|
|
#endif |
|
|
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; |
|
|
|
} job_t; |
|
|
|
|
|
|
|
@@ -351,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
/* Make sure no one is using the workspace */ |
|
|
|
START_RPCC(); |
|
|
|
for (i = 0; i < args -> nthreads; i++) |
|
|
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; |
|
|
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; |
|
|
|
STOP_RPCC(waiting1); |
|
|
|
|
|
|
|
#if defined(FUSED_GEMM) && !defined(TIMING) |
|
|
|
@@ -413,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
|
|
|
|
/* Wait until other region of B is initialized */ |
|
|
|
START_RPCC(); |
|
|
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; |
|
|
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; |
|
|
|
STOP_RPCC(waiting2); |
|
|
|
|
|
|
|
/* Apply kernel with local region of A and part of other region of B */ |
|
|
|
@@ -431,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
/* Clear synchronization flag if this thread is done with other region of B */ |
|
|
|
if (m_to - m_from == min_i) { |
|
|
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; |
|
|
|
WMB; |
|
|
|
} |
|
|
|
} |
|
|
|
} while (current != mypos); |
|
|
|
@@ -492,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
|
|
|
START_RPCC(); |
|
|
|
for (i = 0; i < args -> nthreads; i++) { |
|
|
|
for (js = 0; js < DIVIDE_RATE; js++) { |
|
|
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; |
|
|
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; |
|
|
|
} |
|
|
|
} |
|
|
|
STOP_RPCC(waiting3); |
|
|
|
@@ -658,8 +655,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG |
|
|
|
} |
|
|
|
|
|
|
|
/* Clear synchronization flags */ |
|
|
|
for (i = 0; i < MAX_CPU_NUMBER; i++) { |
|
|
|
for (j = 0; j < MAX_CPU_NUMBER; j++) { |
|
|
|
for (i = 0; i < nthreads; i++) { |
|
|
|
for (j = 0; j < nthreads; j++) { |
|
|
|
for (k = 0; k < DIVIDE_RATE; k++) { |
|
|
|
job[i].working[j][CACHE_LINE_SIZE * k] = 0; |
|
|
|
} |
|
|
|
|