| @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve | |||
| 9.Known Issues | |||
| * The number of CPUs/cores should be less than or equal to 8*sizeof(unsigned long). On 64-bit systems, the limit | |||
| is 64. On 32-bit systems, it is 32. | |||
| * On Loongson 3A, "make test" may fail because of a pthread_create error (error code EAGAIN). However, the same test case runs fine when it is launched directly from the shell. I don't think this is a bug in OpenBLAS. | |||
| 10. Specification of Git Branches | |||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||
| @@ -2127,7 +2127,9 @@ | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG sgemm_p; | |||
| extern BLASLONG sgemm_q; | |||
| extern BLASLONG sgemm_r; | |||
| @@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define CMPEQ c.eq.d | |||
| #define CMPLE c.le.d | |||
| #define CMPLT c.lt.d | |||
| #define NEG neg.d | |||
| #else | |||
| #define LD lwc1 | |||
| #define ST swc1 | |||
| @@ -170,6 +171,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define CMPEQ c.eq.s | |||
| #define CMPLE c.le.s | |||
| #define CMPLT c.lt.s | |||
| #define PLU plu.ps | |||
| #define PLL pll.ps | |||
| #define PUU puu.ps | |||
| #define PUL pul.ps | |||
| #define MADPS madd.ps | |||
| #define CVTU cvt.s.pu | |||
| #define CVTL cvt.s.pl | |||
| #define NEG neg.s | |||
| #endif | |||
| #if defined(__64BIT__) && defined(USE64BITINT) | |||
| @@ -218,7 +227,7 @@ REALNAME: ;\ | |||
| #define SEEK_ADDRESS | |||
| #define BUFFER_SIZE ( 8 << 20) | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #if defined(LOONGSON3A) | |||
| #define PAGESIZE (16UL << 10) | |||
| @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| queue[num_cpu].args = arg; | |||
| queue[num_cpu].range_m = range_m; | |||
| queue[num_cpu].range_n = &range[num_cpu]; | |||
| queue[num_cpu].sa = NULL; | |||
| #if defined(LOONGSON3A) | |||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | |||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | |||
| #else | |||
| queue[num_cpu].sa = NULL; | |||
| queue[num_cpu].sb = NULL; | |||
| #endif | |||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||
| num_cpu ++; | |||
| } | |||
| if (num_cpu) { | |||
| #if defined(LOONGSON3A) | |||
| queue[0].sa = sa; | |||
| queue[0].sb = sb; | |||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | |||
| #else | |||
| queue[0].sa = sa; | |||
| queue[0].sb = sb; | |||
| #endif | |||
| queue[num_cpu - 1].next = NULL; | |||
| exec_blas(num_cpu, | |||
| @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ | |||
| /* Initializing routine */ | |||
| int blas_thread_init(void){ | |||
| BLASLONG i; | |||
| int ret; | |||
| #ifdef NEED_STACKATTR | |||
| pthread_attr_t attr; | |||
| #endif | |||
| @@ -545,12 +546,16 @@ int blas_thread_init(void){ | |||
| pthread_cond_init (&thread_status[i].wakeup, NULL); | |||
| #ifdef NEED_STACKATTR | |||
| pthread_create(&blas_threads[i], &attr, | |||
| ret=pthread_create(&blas_threads[i], &attr, | |||
| (void *)&blas_thread_server, (void *)i); | |||
| #else | |||
| pthread_create(&blas_threads[i], NULL, | |||
| ret=pthread_create(&blas_threads[i], NULL, | |||
| (void *)&blas_thread_server, (void *)i); | |||
| #endif | |||
| if(ret!=0){ | |||
| fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); | |||
| exit(1); | |||
| } | |||
| } | |||
| #ifdef MONITOR | |||
| @@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| #if defined(ARCH_MIPS64) | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| #endif | |||
| } | |||
| void openblas_set_num_threads(int num_threads) { | |||
| @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { | |||
| omp_set_num_threads(blas_cpu_number); | |||
| #if defined(ARCH_MIPS64) | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| #endif | |||
| } | |||
| void openblas_set_num_threads(int num_threads) { | |||
| @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ | |||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||
| #endif | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
| #ifndef DYNAMIC_ARCH | |||
| blas_set_parameter(); | |||
| #endif | |||
| @@ -45,8 +45,22 @@ int get_L2_size(void); | |||
| #define DEFAULT_GEMM_P 128 | |||
| #define DEFAULT_GEMM_Q 128 | |||
| #define DEFAULT_GEMM_R 128 | |||
| #define DEFAULT_GEMM_OFFSET_A 0 | |||
| #define DEFAULT_GEMM_OFFSET_B 0 | |||
| /* Global Parameter */ | |||
| #if GEMM_OFFSET_A == gemm_offset_a | |||
| BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; | |||
| #else | |||
| BLASLONG gemm_offset_a = GEMM_OFFSET_A; | |||
| #endif | |||
| #if GEMM_OFFSET_B == gemm_offset_b | |||
| BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; | |||
| #else | |||
| BLASLONG gemm_offset_b = GEMM_OFFSET_B; | |||
| #endif | |||
| #if SGEMM_P == sgemm_p | |||
| BLASLONG sgemm_p = DEFAULT_GEMM_P; | |||
| #else | |||
| @@ -666,3 +680,21 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| #if defined(ARCH_MIPS64) | |||
// Tuning hook for MIPS64 builds. It only has an effect when LOONGSON3A is
// defined, where it picks the value of dgemm_r:
//   - 1024 when SMP is not defined, or when blas_num_threads is exactly 1;
//   - 200  when SMP is defined and blas_num_threads is anything else.
// Takes no arguments and returns nothing; its only side effect is the write
// to the global dgemm_r.
| void blas_set_parameter(void){ | |||
| #if defined(LOONGSON3A) | |||
| #ifdef SMP | |||
|   dgemm_r = (blas_num_threads == 1) ? 1024 : 200; | |||
| #else | |||
|   dgemm_r = 1024; | |||
| #endif | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) | |||
| @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| FLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| #elif defined(DOUBLE) | |||
| @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||
| #elif defined(DOUBLE) | |||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) | |||
| @@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT | |||
| DTRSMKERNEL_RT = trsm_kernel_RT.S | |||
| endif | |||
| ifndef CTRSMKERNEL_LN | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef CTRSMKERNEL_LT | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef CTRSMKERNEL_RN | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef CTRSMKERNEL_RT | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||
| endif | |||
| ifndef ZTRSMKERNEL_LN | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef ZTRSMKERNEL_LT | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef ZTRSMKERNEL_RN | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||
| endif | |||
| ifndef ZTRSMKERNEL_RT | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||
| endif | |||
| CGEMM3MKERNEL = zgemm3m_kernel.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | |||
| @@ -1,18 +1,48 @@ | |||
| SAXPYKERNEL=axpy_loongson3a.S | |||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | |||
| SGEMMKERNEL = sgemm_kernel_loongson3a.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = gemm_kernel_loongson3a.S | |||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -0,0 +1,101 @@ | |||
| #include "common.h" | |||
| //These are auto-tuning codes on Loongson-3A platform. | |||
| //#define prefetch(x) __builtin_prefetch(x) | |||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||
| #define likely(x) __builtin_expect(!!(x), 1) | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) | |||
| #define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) | |||
| #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) | |||
| #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) | |||
// Real GEMV kernel, non-transposed form.
//
// For every column j in [0, N) and every row i in [0, M) the loop macros
// defined above perform
//     Y[i * INCY] += ALPHA * A[LDA * j + i] * X[j * INCX]
// (the *_alpha1 variants drop the multiplication by ALPHA).
//
// Parameters:
//   M, N      - number of rows / columns walked by the loops.
//   UNUSED    - not referenced in this function.
//   ALPHA     - scalar factor; when it compares equal to zero the function
//               returns immediately and Y is left untouched.
//   A, LDA    - matrix data and the element stride between columns.
//   X, INCX   - input vector and its element stride (one step per column).
//   Y, INCY   - accumulated output vector and its element stride.
//   BUFFER    - not referenced in this function.
// Returns 0 on every path.
//
// The loop macros advance i (and h) themselves, which is why the inner
// for-statements have an empty increment clause.
// NOTE(review): the prefetch operands index fahead elements past the current
// position, which can lie beyond the end of a column / of Y near the tail;
// confirm this load cannot fault on the target.
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) | |||
| { | |||
// Start offsets into X and Y; they stay 0 because the negative-stride
// handling below is commented out.
| BLASLONG kx=0, ky=0; | |||
| if(!ALPHA) | |||
| return 0; | |||
| //if(INCX < 0) | |||
| // kx = (1-N) * INCX; | |||
| // INCX = -INCX; | |||
| //if(INCY < 0) | |||
| // ky = (1-M) * INCY; | |||
| // INCY = -INCY; | |||
// Prefetch distance, in elements, added to the current A / Y index.
| BLASLONG fahead = 30; | |||
// Must equal the number of loop-macro invocations in each main loop body.
| BLASLONG spec_unroll = 4; | |||
// M rounded down to a multiple of spec_unroll; rows [tMQ, M) go to the tail loops.
| BLASLONG tMQ = M - M % spec_unroll; | |||
| BLASLONG j = 0, k = 0; | |||
// Four specialisations: ALPHA == 1 or not, crossed with INCY == 1 or not.
// Exactly one of the four column loops runs, so j is initialised only once.
| if(ALPHA == 1) { | |||
| if(INCY == 1) { | |||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||
| BLASLONG i = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(Y[i + fahead]); | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| } | |||
| for(; likely(i < M);) { | |||
| spec_loop_alpha1; | |||
| } | |||
| } | |||
| } else { | |||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||
// h is the strided index into Y; the norm_loop macros step it by INCY.
| BLASLONG i = 0, h = ky; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(Y[h + fahead]); | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| } | |||
| for(; likely(i < M);) { | |||
| norm_loop_alpha1; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if(INCY == 1) { | |||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||
| BLASLONG i = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(Y[i + fahead]); | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| } | |||
| for(; likely(i < M);) { | |||
| spec_loop; | |||
| } | |||
| } | |||
| } else { | |||
| for(k=kx; likely(j < N); j++, k += INCX) { | |||
| BLASLONG i = 0, h = ky; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(Y[h + fahead]); | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| } | |||
| for(; likely(i < M);) { | |||
| norm_loop; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,93 @@ | |||
| #include "common.h" | |||
| //These are auto-tuning codes on Loongson-3A platform. | |||
| //#define prefetch(x) __builtin_prefetch(x) | |||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||
| #define likely(x) __builtin_expect(!!(x), 1) | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) | |||
| #define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) | |||
| #define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) | |||
| #define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) | |||
// Real GEMV kernel, transposed form.
//
// For every column j in [0, N) and every row i in [0, M) the loop macros
// defined above perform
//     Y[j * INCY] += ALPHA * A[LDA * j + i] * X[i * INCX]
// (the *_alpha1 variants drop the multiplication by ALPHA). Each product is
// added straight into Y; there is no separate accumulator.
//
// Parameters:
//   M, N      - number of rows / columns walked by the loops.
//   UNUSED    - not referenced in this function.
//   ALPHA     - scalar factor; when it compares equal to zero the function
//               returns immediately and Y is left untouched.
//   A, LDA    - matrix data and the element stride between columns.
//   X, INCX   - input vector and its element stride (one step per row).
//   Y, INCY   - accumulated output vector and its element stride (one step
//               per column).
//   BUFFER    - not referenced in this function.
// Returns 0 on every path.
//
// The loop macros advance i (and h) themselves, which is why the inner
// for-statements have an empty increment clause.
// NOTE(review): the prefetch operands index fahead elements past the current
// position, which can lie beyond the end of a column / of X near the tail;
// confirm this load cannot fault on the target.
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||
| if(!ALPHA) | |||
| return 0; | |||
| // if(INCX < 0) | |||
| // INCX = -INCX; | |||
| // if(INCY < 0) | |||
| // INCY = -INCY; | |||
// Prefetch distance, in elements, added to the current A / X index.
| BLASLONG fahead = 30; | |||
// Must equal the number of loop-macro invocations in each main loop body.
| BLASLONG spec_unroll = 3; | |||
// M rounded down to a multiple of spec_unroll; rows [tMQ, M) go to the tail loops.
| BLASLONG tMQ = M - M % spec_unroll; | |||
// j counts columns, k is the matching index into Y (stepped by INCY).
| BLASLONG j = 0, k = 0; | |||
// Four specialisations: ALPHA == 1 or not, crossed with INCX == 1 or not.
// Exactly one of the four column loops runs, so j and k are initialised once.
| if(ALPHA == 1) { | |||
| if(INCX == 1) { | |||
| for(; likely(j < N); j++, k += INCY) { | |||
| BLASLONG i = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(X[i + fahead]); | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| } | |||
| for(; likely(i < M);) { | |||
| spec_loop_alpha1; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCY) { | |||
// h is the strided index into X; the norm_loop macros step it by INCX.
| BLASLONG i = 0, h = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(X[h + fahead]); | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| } | |||
| for(; likely(i < M);) { | |||
| norm_loop_alpha1; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if(INCX == 1) { | |||
| for(; likely(j < N); j++, k += INCY) { | |||
| BLASLONG i = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(X[i + fahead]); | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| } | |||
| for(; likely(i < M);) { | |||
| spec_loop; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCY) { | |||
| BLASLONG i = 0, h = 0; | |||
| for(; likely(i < tMQ);) { | |||
| prefetch(A[LDA * j + i + fahead]); | |||
| prefetch(X[h + fahead]); | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| } | |||
| for(; likely(i < M);) { | |||
| norm_loop; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,139 @@ | |||
| #include "common.h" | |||
| //typedef int BLASLONG; | |||
| //typedef double FLOAT; | |||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||
| #define likely(x) __builtin_expect(!!(x), 1) | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||
| #define spec_loop spec_loop_0 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||
| #define norm_loop norm_loop_0 | |||
| #endif | |||
| #if defined(CONJ) && !defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||
| #define spec_loop spec_loop_1 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||
| #define norm_loop norm_loop_1 | |||
| #endif | |||
| #if !defined(CONJ) && defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||
| #define spec_loop spec_loop_2 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||
| #define norm_loop norm_loop_2 | |||
| #endif | |||
| #if defined(CONJ) && defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||
| #define spec_loop spec_loop_3 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||
| #define norm_loop norm_loop_3 | |||
| #endif | |||
| #define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||
// Complex GEMV kernel, non-transposed form.
//
// Every complex value occupies two adjacent FLOATs (real part first). For
// every column j in [0, N) and every row i in [0, M) the loop macros defined
// above accumulate
//     y_i += alpha * (a_ij * x_j)
// reading a_ij at A[2*LDA*j + 2*i], x_j at X[2*INCX*j] and updating y_i at
// Y[2*INCY*i]. CONJ / XCONJ select which of the four sign variants of the
// macros is compiled in.
//
// Parameters:
//   M, N            - number of rows / columns walked by the loops.
//   UNUSED          - not referenced in this function.
//   rALPHA, iALPHA  - real and imaginary part of the scalar factor.
//   A, LDA          - matrix data and the complex-element stride between columns.
//   X, INCX         - input vector and its complex-element stride.
//   Y, INCY         - accumulated output vector and its complex-element stride.
//   BUFFER          - not referenced in this function.
// Returns 0 on every path.
//
// NOTE(review): the prefetch operands index fahead elements past the current
// position, which can lie beyond the end of a column / of Y near the tail;
// confirm this load cannot fault on the target.
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||
// Nothing to accumulate only when alpha is exactly zero, i.e. BOTH parts are
// zero. A purely imaginary alpha (rALPHA == 0, iALPHA != 0) still contributes
// and must fall through to the general path below.
| if(!rALPHA && !iALPHA) | |||
| return 0; | |||
// Prefetch distance, in FLOAT elements, added to the current A / Y index.
| BLASLONG fahead = 60; | |||
// Must equal the number of loop-macro invocations in each main loop body.
| BLASLONG spec_unroll = 2; | |||
// M rounded down to a multiple of spec_unroll; rows [tMQ, M) go to the tail loops.
| BLASLONG tMQ = M - M % spec_unroll; | |||
// j counts columns; k and jj are the matching FLOAT offsets into X and A.
| BLASLONG j = 0, k = 0, jj = 0; | |||
// Four specialisations: alpha == 1+0i or not, crossed with INCY == 1 or not.
// Exactly one of the four column loops runs, so j, k and jj are set up once.
| if(rALPHA == 1 && iALPHA == 0) { | |||
| if(INCY == 1) { | |||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||
// i counts complex rows; ii is the FLOAT offset (advanced by 2 inside the macros).
| BLASLONG i = 0, ii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(Y[ii + fahead]); | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| spec_loop_alpha1; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||
// iii is the strided FLOAT offset into Y (advanced by INCY * 2 inside the macros).
| BLASLONG i = 0, ii = 0, iii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(Y[iii + fahead]); | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| norm_loop_alpha1; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
// Scratch for the complex product a_ij * x_j used by the general-alpha macros.
| FLOAT rTmp, iTmp; | |||
| if(INCY == 1) { | |||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||
| BLASLONG i = 0, ii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(Y[ii + fahead]); | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| spec_loop; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||
| BLASLONG i = 0, ii = 0, iii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(Y[iii + fahead]); | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| norm_loop; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,125 @@ | |||
| #include "common.h" | |||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) | |||
| #define likely(x) __builtin_expect(!!(x), 1) | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||
| #define spec_loop spec_loop_0 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||
| #define norm_loop norm_loop_0 | |||
| #endif | |||
| #if defined(CONJ) && !defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||
| #define spec_loop spec_loop_1 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||
| #define norm_loop norm_loop_1 | |||
| #endif | |||
| #if !defined(CONJ) && defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||
| #define spec_loop spec_loop_2 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||
| #define norm_loop norm_loop_2 | |||
| #endif | |||
| #if defined(CONJ) && defined(XCONJ) | |||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||
| #define spec_loop spec_loop_3 | |||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||
| #define norm_loop norm_loop_3 | |||
| #endif | |||
| #define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||
| #define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||
| #define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) | |||
// Complex GEMV kernel, transposed form.
//
// Every complex value occupies two adjacent FLOATs (real part first). For
// every column j in [0, N) and every row i in [0, M) the loop macros defined
// above accumulate
//     y_j += alpha * (a_ij * x_i)
// reading a_ij at A[2*LDA*j + 2*i], x_i at X[2*INCX*i] and updating y_j at
// Y[2*INCY*j]. CONJ / XCONJ select which of the four sign variants of the
// macros is compiled in.
//
// Parameters:
//   M, N            - number of rows / columns walked by the loops.
//   UNUSED          - not referenced in this function.
//   rALPHA, iALPHA  - real and imaginary part of the scalar factor.
//   A, LDA          - matrix data and the complex-element stride between columns.
//   X, INCX         - input vector and its complex-element stride.
//   Y, INCY         - accumulated output vector and its complex-element stride.
//   BUFFER          - not referenced in this function.
// Returns 0 on every path.
//
// NOTE(review): the prefetch operands index fahead elements past the current
// position, which can lie beyond the end of a column / of X near the tail;
// confirm this load cannot fault on the target.
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||
// Nothing to accumulate only when alpha is exactly zero, i.e. BOTH parts are
// zero. A purely imaginary alpha (rALPHA == 0, iALPHA != 0) still contributes
// and must fall through to the general path below.
| if(!rALPHA && !iALPHA) | |||
| return 0; | |||
// Prefetch distance, in FLOAT elements, added to the current A / X index.
| BLASLONG fahead = 30; | |||
// Must equal the number of loop-macro invocations in each main loop body.
| BLASLONG spec_unroll = 2; | |||
// M rounded down to a multiple of spec_unroll; rows [tMQ, M) go to the tail loops.
| BLASLONG tMQ = M - M % spec_unroll; | |||
// j counts columns; k and jj are the matching FLOAT offsets into Y and A.
| BLASLONG j = 0, k = 0, jj = 0; | |||
// Four specialisations: alpha == 1+0i or not, crossed with INCX == 1 or not.
// Exactly one of the four column loops runs, so j, k and jj are set up once.
| if(rALPHA == 1 && iALPHA == 0) { | |||
| if(INCX == 1) { | |||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||
// i counts complex rows; ii is the FLOAT offset (advanced by 2 inside the macros).
| BLASLONG i = 0, ii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(X[ii + fahead]); | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| /*loop_mark*/ spec_loop_alpha1; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| spec_loop_alpha1; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||
// iii is the strided FLOAT offset into X (advanced by INCX * 2 inside the macros).
| BLASLONG i = 0, ii = 0, iii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(X[iii + fahead]); | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| /*loop_mark*/ norm_loop_alpha1; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| norm_loop_alpha1; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
// Scratch for the complex product a_ij * x_i used by the general-alpha macros.
| FLOAT rTmp, iTmp; | |||
| if(INCX == 1) { | |||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||
| BLASLONG i = 0, ii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(X[ii + fahead]); | |||
| /*loop_mark*/ spec_loop; | |||
| /*loop_mark*/ spec_loop; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| spec_loop; | |||
| } | |||
| } | |||
| } else { | |||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||
| BLASLONG i = 0, ii = 0, iii = 0; | |||
| for(; likely(i < tMQ); i += spec_unroll) { | |||
| prefetch(A[jj + ii + fahead]); | |||
| prefetch(X[iii + fahead]); | |||
| /*loop_mark*/ norm_loop; | |||
| /*loop_mark*/ norm_loop; | |||
| } | |||
| for(; likely(i < M); i++) { | |||
| norm_loop; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1480,31 +1480,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 1 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 32 | |||
| #define DGEMM_DEFAULT_P 32 | |||
| #define CGEMM_DEFAULT_P 108 | |||
| #define ZGEMM_DEFAULT_P 112 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_Q 116 | |||
| #define DGEMM_DEFAULT_Q 116 | |||
| #define CGEMM_DEFAULT_Q 144 | |||
| #define ZGEMM_DEFAULT_Q 72 | |||
| #define SGEMM_DEFAULT_P 64 | |||
| #define DGEMM_DEFAULT_P 44 | |||
| #define CGEMM_DEFAULT_P 64 | |||
| #define ZGEMM_DEFAULT_P 32 | |||
| #define SGEMM_DEFAULT_R 1000 | |||
| #define DGEMM_DEFAULT_R 1000 | |||
| #define CGEMM_DEFAULT_R 2000 | |||
| #define ZGEMM_DEFAULT_R 2000 | |||
| #define SGEMM_DEFAULT_Q 192 | |||
| #define DGEMM_DEFAULT_Q 92 | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 80 | |||
| #define SGEMM_DEFAULT_R 1024 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 1024 | |||
| #define ZGEMM_DEFAULT_R 1024 | |||
| #define GEMM_OFFSET_A1 0x10000 | |||
| #define GEMM_OFFSET_B1 0x100000 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| @@ -1301,6 +1301,8 @@ | |||
| NC = 0 | |||
| RESET = .TRUE. | |||
| ERRMAX = RZERO | |||
| RALS = RONE | |||
| RBETS = RONE | |||
| * | |||
| DO 100 IN = 1, NIDIM | |||
| N = IDIM( IN ) | |||
| @@ -1303,6 +1303,8 @@ | |||
| NC = 0 | |||
| RESET = .TRUE. | |||
| ERRMAX = RZERO | |||
| RALS = RONE | |||
| RBETS = RONE | |||
| * | |||
| DO 100 IN = 1, NIDIM | |||
| N = IDIM( IN ) | |||