| @@ -124,7 +124,13 @@ In chronological order: | |||
| * Jerome Robert <jeromerobert@gmx.com> | |||
| * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478) | |||
| * [2015-12-23] `stack_check` in `gemv.c` (bug #722) | |||
| * [2015-12-28] Allow to force the number of parallel make job | |||
| * [2015-12-28] Fix detection of AMD E2-3200 detection | |||
| * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected | |||
| * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731) | |||
| * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742) | |||
| * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760) | |||
| * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727) | |||
| * Dan Kortschak | |||
| * [2015-01-07] Added test for drotmg bug #484. | |||
| @@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| smallscaling | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c | |||
| zgemm3m.$(SUFFIX) : gemm3m.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^ | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib | |||
| @@ -0,0 +1,191 @@ | |||
| // run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n | |||
| #include <math.h> | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include <time.h> | |||
| #include <cblas.h> | |||
| #include <omp.h> | |||
| #define MIN_SIZE 5 | |||
| #define MAX_SIZE 60 | |||
| #define NB_SIZE 10 | |||
| // number of loop for a 1x1 matrix. Lower it if the test is | |||
| // too slow on you computer. | |||
| #define NLOOP 2e7 | |||
| typedef struct { | |||
| int matrix_size; | |||
| int n_loop; | |||
| void (* bench_func)(); | |||
| void (* blas_func)(); | |||
| void * (* create_matrix)(int size); | |||
| } BenchParam; | |||
| void * s_create_matrix(int size) { | |||
| float * r = malloc(size * sizeof(double)); | |||
| for(int i = 0; i < size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * c_create_matrix(int size) { | |||
| float * r = malloc(size * 2 * sizeof(double)); | |||
| for(int i = 0; i < 2 * size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * z_create_matrix(int size) { | |||
| double * r = malloc(size * 2 * sizeof(double)); | |||
| for(int i = 0; i < 2 * size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void * d_create_matrix(int size) { | |||
| double * r = malloc(size * sizeof(double)); | |||
| for(int i = 0; i < size; i++) | |||
| r[i] = 1e3 * i / size; | |||
| return r; | |||
| } | |||
| void trmv_bench(BenchParam * param) | |||
| { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func("U", "N", "N", &size, A, &size, y, &one); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| void gemv_bench(BenchParam * param) | |||
| { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| double v = 1.01; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| void ger_bench(BenchParam * param) { | |||
| int i, n; | |||
| int size = param->matrix_size; | |||
| n = param->n_loop / size; | |||
| double v = 1.01; | |||
| int one = 1; | |||
| void * A = param->create_matrix(size * size); | |||
| void * y = param->create_matrix(size); | |||
| for(i = 0; i < n; i++) { | |||
| param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size); | |||
| } | |||
| free(A); | |||
| free(y); | |||
| } | |||
| #ifndef _WIN32 | |||
| void * pthread_func_wrapper(void * param) { | |||
| ((BenchParam *)param)->bench_func(param); | |||
| pthread_exit(NULL); | |||
| } | |||
| #endif | |||
| #define NB_TESTS 5 | |||
| void * TESTS[4 * NB_TESTS] = { | |||
| trmv_bench, ztrmv_, z_create_matrix, "ztrmv", | |||
| gemv_bench, dgemv_, d_create_matrix, "dgemv", | |||
| gemv_bench, zgemv_, z_create_matrix, "zgemv", | |||
| ger_bench, dger_, d_create_matrix, "dger", | |||
| ger_bench, zgerc_, z_create_matrix, "zgerc", | |||
| }; | |||
| inline static double delta_time(struct timespec tick) { | |||
| struct timespec tock; | |||
| clock_gettime(CLOCK_MONOTONIC, &tock); | |||
| return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9; | |||
| } | |||
| double pthread_bench(BenchParam * param, int nb_threads) | |||
| { | |||
| #ifdef _WIN32 | |||
| return 0; | |||
| #else | |||
| BenchParam threaded_param = *param; | |||
| pthread_t threads[nb_threads]; | |||
| int t, rc; | |||
| struct timespec tick; | |||
| threaded_param.n_loop /= nb_threads; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| for(t=0; t<nb_threads; t++){ | |||
| rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param); | |||
| if (rc){ | |||
| printf("ERROR; return code from pthread_create() is %d\n", rc); | |||
| exit(-1); | |||
| } | |||
| } | |||
| for(t=0; t<nb_threads; t++){ | |||
| pthread_join(threads[t], NULL); | |||
| } | |||
| return delta_time(tick); | |||
| #endif | |||
| } | |||
| double seq_bench(BenchParam * param) { | |||
| struct timespec tick; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| param->bench_func(param); | |||
| return delta_time(tick); | |||
| } | |||
| double omp_bench(BenchParam * param) { | |||
| BenchParam threaded_param = *param; | |||
| struct timespec tick; | |||
| int t; | |||
| int nb_threads = omp_get_max_threads(); | |||
| threaded_param.n_loop /= nb_threads; | |||
| clock_gettime(CLOCK_MONOTONIC, &tick); | |||
| #pragma omp parallel for | |||
| for(t = 0; t < nb_threads; t ++){ | |||
| param->bench_func(&threaded_param); | |||
| } | |||
| return delta_time(tick); | |||
| } | |||
| int main(int argc, char * argv[]) { | |||
| double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE); | |||
| BenchParam param; | |||
| int test_id; | |||
| printf ("Running on %d threads\n", omp_get_max_threads()); | |||
| for(test_id = 0; test_id < NB_TESTS; test_id ++) { | |||
| double size = MIN_SIZE; | |||
| param.bench_func = TESTS[test_id * 4]; | |||
| param.blas_func = TESTS[test_id * 4 + 1]; | |||
| param.create_matrix = TESTS[test_id * 4 + 2]; | |||
| printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]); | |||
| param.n_loop = NLOOP; | |||
| while(size <= MAX_SIZE) { | |||
| param.matrix_size = (int)(size + 0.5); | |||
| double seq_time = seq_bench(¶m); | |||
| double omp_time = omp_bench(¶m); | |||
| double pthread_time = pthread_bench(¶m, omp_get_max_threads()); | |||
| printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, " | |||
| "pthread %gs, speedup %g\n", | |||
| param.matrix_size, seq_time, | |||
| omp_time, seq_time / omp_time, | |||
| pthread_time, seq_time / pthread_time); | |||
| size *= inc_factor; | |||
| } | |||
| } | |||
| } | |||
| @@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| #endif | |||
| x = buffer; | |||
| buffer += ((COMPSIZE * args -> m + 1023) & ~1023); | |||
| buffer += ((COMPSIZE * args -> m + 3) & ~3); | |||
| } | |||
| #ifndef TRANS | |||
| @@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| if (num_cpu) { | |||
| queue[0].sa = NULL; | |||
| queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; | |||
| queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE; | |||
| queue[num_cpu - 1].next = NULL; | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu | |||
| if (incb != 1) { | |||
| B = buffer; | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); | |||
| COPY_K(m, b, incb, buffer, 1); | |||
| } | |||
| @@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, | |||
| blasint incy = *INCY; | |||
| FLOAT *buffer; | |||
| int buffer_size; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| #endif | |||
| @@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FLOAT *buffer; | |||
| blasint lenx, leny; | |||
| int trans; | |||
| int trans, buffer_size; | |||
| blasint info, t; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incx < 0) x -= (lenx - 1) * incx * 2; | |||
| if (incy < 0) y -= (leny - 1) * incy * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT) ; | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 | |||
| // cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746) | |||
| if(trans && stack_alloc_size) | |||
| memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size)); | |||
| #endif | |||
| #ifdef SMP | |||
| @@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
| @@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incy < 0) y -= (n - 1) * incy * 2; | |||
| if (incx < 0) x -= (m - 1) * incx * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| STACK_ALLOC(2 * m, FLOAT, buffer); | |||
| #ifdef SMPTEST | |||
| // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 | |||
| @@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
| @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, | |||
| blasint info; | |||
| int uplo; | |||
| int unit; | |||
| int trans; | |||
| int trans, buffer_size; | |||
| FLOAT *buffer; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| @@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { | |||
| int trans, uplo, unit; | |||
| int trans, uplo, unit, buffer_size; | |||
| blasint info; | |||
| FLOAT *buffer; | |||
| #ifdef SMP | |||
| @@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| if (incx < 0 ) x -= (n - 1) * incx * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| nthreads = num_cpu_avail(2); | |||
| // Calibrated on a Xeon E5-2630 | |||
| if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) { | |||
| nthreads = num_cpu_avail(2); | |||
| if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = 2; | |||
| } else | |||
| nthreads = 1; | |||
| if(nthreads > 1) { | |||
| buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
| } | |||
| else | |||
| #endif | |||
| { | |||
| buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); | |||
| if(incx != 1) | |||
| buffer_size += n * 2; | |||
| } | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #ifdef SMP | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| STACK_FREE(buffer); | |||
| FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); | |||