ger and gemv call blas_memory_alloc/free, which in turn call blas_lock. blas_lock creates thread contention when the matrices are small and the number of threads is high enough. We avoid calling blas_memory_alloc by replacing it with stack allocation. This can be enabled with: make -DMAX_STACK_ALLOC=2048 The given size (in bytes) must be high enough to avoid thread contention and small enough to avoid a stack overflow. Fix #478 (tags/v0.2.14^2)
| @@ -305,6 +305,10 @@ ifdef SANITY_CHECK | |||||
| CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) | CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) | ||||
| endif | endif | ||||
| ifdef MAX_STACK_ALLOC | |||||
| CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) | |||||
| endif | |||||
| # | # | ||||
| # Architecture dependent settings | # Architecture dependent settings | ||||
| # | # | ||||
| @@ -208,7 +208,18 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (incx < 0) x -= (lenx - 1) * incx; | if (incx < 0) x -= (lenx - 1) * incx; | ||||
| if (incy < 0) y -= (leny - 1) * incy; | if (incy < 0) y -= (leny - 1) * incy; | ||||
| #ifdef MAX_STACK_ALLOC | |||||
| int stack_alloc_size = m + n; | |||||
| if(stack_alloc_size < 128) | |||||
| //dgemv_n.S requires a 128-byte buffer | |||||
| stack_alloc_size = 128; | |||||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||||
| stack_alloc_size = 0; | |||||
| FLOAT stack_buffer[stack_alloc_size]; | |||||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||||
| #else | |||||
| buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
| #endif | |||||
| #ifdef SMP | #ifdef SMP | ||||
| @@ -237,7 +248,10 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| } | } | ||||
| #endif | #endif | ||||
| blas_memory_free(buffer); | |||||
| #ifdef MAX_STACK_ALLOC | |||||
| if(!stack_alloc_size) | |||||
| #endif | |||||
| blas_memory_free(buffer); | |||||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | ||||
| @@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
| if (incx < 0) x -= (m - 1) * incx; | if (incx < 0) x -= (m - 1) * incx; | ||||
| #ifdef MAX_STACK_ALLOC | |||||
| int stack_alloc_size = m; | |||||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||||
| stack_alloc_size = 0; | |||||
| FLOAT stack_buffer[stack_alloc_size]; | |||||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||||
| #else | |||||
| buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
| #endif | |||||
| #ifdef SMPTEST | #ifdef SMPTEST | ||||
| nthreads = num_cpu_avail(2); | nthreads = num_cpu_avail(2); | ||||
| @@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| } | } | ||||
| #endif | #endif | ||||
| blas_memory_free(buffer); | |||||
| #ifdef MAX_STACK_ALLOC | |||||
| if(!stack_alloc_size) | |||||
| #endif | |||||
| blas_memory_free(buffer); | |||||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | ||||
| @@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||||
| if ( n < 1 ) return(0); | if ( n < 1 ) return(0); | ||||
| xbuffer = buffer; | xbuffer = buffer; | ||||
| ytemp = buffer + NBMAX; | |||||
| ytemp = buffer + (m < NBMAX ? m : NBMAX); | |||||
| n0 = n / NBMAX; | n0 = n / NBMAX; | ||||
| n1 = (n % NBMAX) >> 2 ; | n1 = (n % NBMAX) >> 2 ; | ||||