updated some level1 functions that are not thread safe
tags/v0.2.20^2
| @@ -42,24 +42,6 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| static int asum_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha, | |||||
| float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz) | |||||
| { | |||||
| #ifndef CBLAS | |||||
| FLOATRET ret; | |||||
| ret = (FLOATRET)ASUM_K(m, x, incx); | |||||
| *((double *)z) = (double)ret; | |||||
| #else | |||||
| FLOAT ret; | |||||
| ret = ASUM_K(m, x, incx); | |||||
| *((double *)z) = (double)ret; | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | ||||
| @@ -70,62 +52,14 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
| PRINT_DEBUG_NAME; | PRINT_DEBUG_NAME; | ||||
| #ifdef SMP | |||||
| int i; | |||||
| int mode, nthreads; | |||||
| double mid_result= 0.0; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| double *buffer = (double*)blas_memory_alloc(0); | |||||
| #endif | |||||
| if (n <= 0) return 0; | if (n <= 0) return 0; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| ret = (FLOATRET)ASUM_K(n, x, incx); | ret = (FLOATRET)ASUM_K(n, x, incx); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #else | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha, | |||||
| x, incx, NULL, 0, buffer, 0, (void *)asum_threads, nthreads); | |||||
| for(i = 0; i < nthreads; i++) | |||||
| mid_result += buffer[2*i]; | |||||
| ret = (FLOATRET)mid_result; | |||||
| } | |||||
| blas_memory_free(buffer); | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | FUNCTION_PROFILE_END(COMPSIZE, n, n); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||
| @@ -141,68 +75,18 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #ifdef SMP | |||||
| int i; | |||||
| int mode, nthreads; | |||||
| double mid_result= 0.0; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| double *buffer = (double*)blas_memory_alloc(0); | |||||
| #endif | |||||
| if (n <= 0) return 0; | if (n <= 0) return 0; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| ret = ASUM_K(n, x, incx); | ret = ASUM_K(n, x, incx); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #else | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha, | |||||
| x, incx, NULL, 0, buffer, 0, (void *)asum_threads, nthreads); | |||||
| for(i = 0; i < nthreads; i++) | |||||
| mid_result += buffer[2*i]; | |||||
| ret = (FLOAT)mid_result; | |||||
| } | |||||
| blas_memory_free(buffer); | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | FUNCTION_PROFILE_END(COMPSIZE, n, n); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -42,17 +42,6 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| static int copy_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha, | |||||
| float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz) | |||||
| { | |||||
| COPY_K(m, x, incx, y, incy); | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | ||||
| @@ -71,11 +60,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| int mode, nthreads; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| #endif | |||||
| if (n <= 0) return; | if (n <= 0) return; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| @@ -85,42 +69,8 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||||
| if (incx < 0) x -= (n - 1) * incx * COMPSIZE; | if (incx < 0) x -= (n - 1) * incx * COMPSIZE; | ||||
| if (incy < 0) y -= (n - 1) * incy * COMPSIZE; | if (incy < 0) y -= (n - 1) * incy * COMPSIZE; | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| COPY_K(n, x, incx, y, incy); | COPY_K(n, x, incx, y, incy); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #else | |||||
| #ifndef COMPLEX | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| blas_level1_thread(mode, n, 0, 0, dummyalpha, | |||||
| x, incx, y, incy, NULL, 0, (void *)copy_threads, nthreads); | |||||
| } | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); | FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||
| @@ -42,24 +42,6 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| static int dot_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha, | |||||
| float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz) | |||||
| { | |||||
| #ifndef CBLAS | |||||
| FLOATRET ret; | |||||
| ret = (FLOATRET)DOTU_K(m, x, incx, y, incy); | |||||
| *((double *)z) = (double)ret; | |||||
| #else | |||||
| FLOAT ret; | |||||
| ret = DOTU_K(n, x, incx, y, incy); | |||||
| *((double *)z) = (double)ret; | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | ||||
| @@ -71,14 +53,6 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||||
| PRINT_DEBUG_NAME; | PRINT_DEBUG_NAME; | ||||
| #ifdef SMP | |||||
| int i; | |||||
| int mode, nthreads; | |||||
| double mid_result= 0.0; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| double *buffer = (double*)blas_memory_alloc(0); | |||||
| #endif | |||||
| if (n <= 0) return 0.; | if (n <= 0) return 0.; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| @@ -88,40 +62,8 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ | |||||
| if (incx < 0) x -= (n - 1) * incx; | if (incx < 0) x -= (n - 1) * incx; | ||||
| if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| ret = (FLOATRET)DOTU_K(n, x, incx, y, incy); | ret = (FLOATRET)DOTU_K(n, x, incx, y, incy); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #endif | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha, | |||||
| x, incx, y, incy, buffer, 0, (void *)dot_threads, nthreads); | |||||
| for(i = 0; i < nthreads; i++) | |||||
| mid_result += buffer[2*i]; | |||||
| ret = (FLOATRET)mid_result; | |||||
| } | |||||
| blas_memory_free(buffer); | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(1, 2 * n, 2 * n); | FUNCTION_PROFILE_END(1, 2 * n, 2 * n); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||
| @@ -137,14 +79,6 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #ifdef SMP | |||||
| int i; | |||||
| int mode, nthreads; | |||||
| double mid_result= 0.0; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| double *buffer = (double*)blas_memory_alloc(0); | |||||
| #endif | |||||
| if (n <= 0) return 0.; | if (n <= 0) return 0.; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| @@ -154,39 +88,8 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||||
| if (incx < 0) x -= (n - 1) * incx; | if (incx < 0) x -= (n - 1) * incx; | ||||
| if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| ret = DOTU_K(n, x, incx, y, incy); | ret = DOTU_K(n, x, incx, y, incy); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #endif | |||||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummyalpha, | |||||
| x, incx, y, incy, buffer, 0, (void *)dot_threads, nthreads); | |||||
| for(i = 0; i < nthreads; i++) | |||||
| mid_result += buffer[2*i]; | |||||
| ret = (FLOAT)mid_result; | |||||
| } | |||||
| blas_memory_free(buffer); | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(1, 2 * n, 2 * n); | FUNCTION_PROFILE_END(1, 2 * n, 2 * n); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||
| @@ -42,16 +42,6 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| static int rot_threads (BLASLONG m, BLASLONG n, BLASLONG k, float alpha, | |||||
| float* x, BLASLONG incx, float* y, BLASLONG incy, float* z, BLASLONG incz) | |||||
| { | |||||
| ROT_K(m, x, incx, y, incy, n, k); | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ | void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ | ||||
| @@ -72,11 +62,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F | |||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| int mode, nthreads; | |||||
| FLOAT dummyalpha[2] = {ZERO, ZERO}; | |||||
| #endif | |||||
| if (n <= 0) return; | if (n <= 0) return; | ||||
| IDEBUG_START; | IDEBUG_START; | ||||
| @@ -86,34 +71,8 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F | |||||
| if (incx < 0) x -= (n - 1) * incx; | if (incx < 0) x -= (n - 1) * incx; | ||||
| if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
| #ifdef SMP | |||||
| nthreads = num_cpu_avail(1); | |||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //multithreads. | |||||
| if (n <= 100000) | |||||
| nthreads = 1; | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| ROT_K(n, x, incx, y, incy, c, s); | ROT_K(n, x, incx, y, incy, c, s); | ||||
| #ifdef SMP | |||||
| } else { | |||||
| #ifndef DOUBLE | |||||
| mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #else | |||||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #endif | |||||
| blas_level1_thread(mode, n, c, s, dummyalpha, | |||||
| x, incx, y, incy, NULL, 0, (void *)rot_threads, nthreads); | |||||
| } | |||||
| #endif | |||||
| FUNCTION_PROFILE_END(1, n, n); | FUNCTION_PROFILE_END(1, n, n); | ||||
| IDEBUG_END; | IDEBUG_END; | ||||