| @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve | |||||
| 9.Known Issues | 9.Known Issues | ||||
| * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | ||||
| is 64. On 32 bits, it is 32. | is 64. On 32 bits, it is 32. | ||||
| * On Loongson 3A, "make test" may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case passes when it is run directly from the shell. I don't think this is a bug in OpenBLAS. | |||||
| 10. Specification of Git Branches | 10. Specification of Git Branches | ||||
| We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | ||||
| @@ -2127,7 +2127,9 @@ | |||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| extern BLASLONG gemm_offset_a; | |||||
| extern BLASLONG gemm_offset_b; | |||||
| extern BLASLONG sgemm_p; | extern BLASLONG sgemm_p; | ||||
| extern BLASLONG sgemm_q; | extern BLASLONG sgemm_q; | ||||
| extern BLASLONG sgemm_r; | extern BLASLONG sgemm_r; | ||||
| @@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define CMPEQ c.eq.d | #define CMPEQ c.eq.d | ||||
| #define CMPLE c.le.d | #define CMPLE c.le.d | ||||
| #define CMPLT c.lt.d | #define CMPLT c.lt.d | ||||
| #define NEG neg.d | |||||
| #else | #else | ||||
| #define LD lwc1 | #define LD lwc1 | ||||
| #define ST swc1 | #define ST swc1 | ||||
| @@ -170,6 +171,14 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define CMPEQ c.eq.s | #define CMPEQ c.eq.s | ||||
| #define CMPLE c.le.s | #define CMPLE c.le.s | ||||
| #define CMPLT c.lt.s | #define CMPLT c.lt.s | ||||
| #define PLU plu.ps | |||||
| #define PLL pll.ps | |||||
| #define PUU puu.ps | |||||
| #define PUL pul.ps | |||||
| #define MADPS madd.ps | |||||
| #define CVTU cvt.s.pu | |||||
| #define CVTL cvt.s.pl | |||||
| #define NEG neg.s | |||||
| #endif | #endif | ||||
| #if defined(__64BIT__) && defined(USE64BITINT) | #if defined(__64BIT__) && defined(USE64BITINT) | ||||
| @@ -218,7 +227,7 @@ REALNAME: ;\ | |||||
| #define SEEK_ADDRESS | #define SEEK_ADDRESS | ||||
| #define BUFFER_SIZE ( 8 << 20) | |||||
| #define BUFFER_SIZE ( 32 << 20) | |||||
| #if defined(LOONGSON3A) | #if defined(LOONGSON3A) | ||||
| #define PAGESIZE (16UL << 10) | #define PAGESIZE (16UL << 10) | ||||
| @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| queue[num_cpu].args = arg; | queue[num_cpu].args = arg; | ||||
| queue[num_cpu].range_m = range_m; | queue[num_cpu].range_m = range_m; | ||||
| queue[num_cpu].range_n = &range[num_cpu]; | queue[num_cpu].range_n = &range[num_cpu]; | ||||
| queue[num_cpu].sa = NULL; | |||||
| #if defined(LOONGSON3A) | |||||
| queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; | |||||
| queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; | |||||
| #else | |||||
| queue[num_cpu].sa = NULL; | |||||
| queue[num_cpu].sb = NULL; | queue[num_cpu].sb = NULL; | ||||
| #endif | |||||
| queue[num_cpu].next = &queue[num_cpu + 1]; | queue[num_cpu].next = &queue[num_cpu + 1]; | ||||
| num_cpu ++; | num_cpu ++; | ||||
| } | } | ||||
| if (num_cpu) { | if (num_cpu) { | ||||
| #if defined(LOONGSON3A) | |||||
| queue[0].sa = sa; | queue[0].sa = sa; | ||||
| queue[0].sb = sb; | |||||
| queue[0].sb = sa + GEMM_OFFSET_A1 * 5; | |||||
| #else | |||||
| queue[0].sa = sa; | |||||
| queue[0].sb = sb; | |||||
| #endif | |||||
| queue[num_cpu - 1].next = NULL; | queue[num_cpu - 1].next = NULL; | ||||
| exec_blas(num_cpu, | exec_blas(num_cpu, | ||||
| @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ | |||||
| /* Initializing routine */ | /* Initializing routine */ | ||||
| int blas_thread_init(void){ | int blas_thread_init(void){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| int ret; | |||||
| #ifdef NEED_STACKATTR | #ifdef NEED_STACKATTR | ||||
| pthread_attr_t attr; | pthread_attr_t attr; | ||||
| #endif | #endif | ||||
| @@ -545,12 +546,16 @@ int blas_thread_init(void){ | |||||
| pthread_cond_init (&thread_status[i].wakeup, NULL); | pthread_cond_init (&thread_status[i].wakeup, NULL); | ||||
| #ifdef NEED_STACKATTR | #ifdef NEED_STACKATTR | ||||
| pthread_create(&blas_threads[i], &attr, | |||||
| ret=pthread_create(&blas_threads[i], &attr, | |||||
| (void *)&blas_thread_server, (void *)i); | (void *)&blas_thread_server, (void *)i); | ||||
| #else | #else | ||||
| pthread_create(&blas_threads[i], NULL, | |||||
| ret=pthread_create(&blas_threads[i], NULL, | |||||
| (void *)&blas_thread_server, (void *)i); | (void *)&blas_thread_server, (void *)i); | ||||
| #endif | #endif | ||||
| if(ret!=0){ | |||||
| fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); | |||||
| exit(1); | |||||
| } | |||||
| } | } | ||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| @@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) { | |||||
| blas_cpu_number = num_threads; | blas_cpu_number = num_threads; | ||||
| #if defined(ARCH_MIPS64) | |||||
| //set parameters for different number of threads. | |||||
| blas_set_parameter(); | |||||
| #endif | |||||
| } | } | ||||
| void openblas_set_num_threads(int num_threads) { | void openblas_set_num_threads(int num_threads) { | ||||
| @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { | |||||
| omp_set_num_threads(blas_cpu_number); | omp_set_num_threads(blas_cpu_number); | ||||
| #if defined(ARCH_MIPS64) | |||||
| //set parameters for different number of threads. | |||||
| blas_set_parameter(); | |||||
| #endif | |||||
| } | } | ||||
| void openblas_set_num_threads(int num_threads) { | void openblas_set_num_threads(int num_threads) { | ||||
| @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | ||||
| #endif | #endif | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| #ifndef DYNAMIC_ARCH | #ifndef DYNAMIC_ARCH | ||||
| blas_set_parameter(); | blas_set_parameter(); | ||||
| #endif | #endif | ||||
| @@ -45,8 +45,22 @@ int get_L2_size(void); | |||||
| #define DEFAULT_GEMM_P 128 | #define DEFAULT_GEMM_P 128 | ||||
| #define DEFAULT_GEMM_Q 128 | #define DEFAULT_GEMM_Q 128 | ||||
| #define DEFAULT_GEMM_R 128 | #define DEFAULT_GEMM_R 128 | ||||
| #define DEFAULT_GEMM_OFFSET_A 0 | |||||
| #define DEFAULT_GEMM_OFFSET_B 0 | |||||
| /* Global Parameter */ | /* Global Parameter */ | ||||
| #if GEMM_OFFSET_A == gemm_offset_a | |||||
| BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; | |||||
| #else | |||||
| BLASLONG gemm_offset_a = GEMM_OFFSET_A; | |||||
| #endif | |||||
| #if GEMM_OFFSET_B == gemm_offset_b | |||||
| BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; | |||||
| #else | |||||
| BLASLONG gemm_offset_b = GEMM_OFFSET_B; | |||||
| #endif | |||||
| #if SGEMM_P == sgemm_p | #if SGEMM_P == sgemm_p | ||||
| BLASLONG sgemm_p = DEFAULT_GEMM_P; | BLASLONG sgemm_p = DEFAULT_GEMM_P; | ||||
| #else | #else | ||||
| @@ -666,3 +680,21 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(ARCH_MIPS64) | |||||
| /* Sets dgemm_r for Loongson-3A builds: 1024 for a single thread, 200 for several threads. | |||||
|    When LOONGSON3A is not defined the function body is empty. */ | |||||
| void blas_set_parameter(void){ | |||||
| #if defined(LOONGSON3A) | |||||
| #ifdef SMP | |||||
|   /* SMP build: the value depends on blas_num_threads. */ | |||||
|   dgemm_r = (blas_num_threads == 1) ? 1024 : 200; | |||||
| #else | |||||
|   /* Non-SMP build: always the single-thread value. */ | |||||
|   dgemm_r = 1024; | |||||
| #endif | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, | |||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | int mode = BLAS_XDOUBLE | BLAS_REAL; | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, | |||||
| #else | #else | ||||
| int mode = BLAS_SINGLE | BLAS_REAL; | int mode = BLAS_SINGLE | BLAS_REAL; | ||||
| #endif | #endif | ||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | int mode = BLAS_XDOUBLE | BLAS_REAL; | ||||
| #elif defined(DOUBLE) | #elif defined(DOUBLE) | ||||
| @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| #else | #else | ||||
| int mode = BLAS_SINGLE | BLAS_REAL; | int mode = BLAS_SINGLE | BLAS_REAL; | ||||
| #endif | #endif | ||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| @@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT.S | DTRSMKERNEL_RT = trsm_kernel_RT.S | ||||
| endif | endif | ||||
| ifndef CTRSMKERNEL_LN | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LT.S | CTRSMKERNEL_LN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_LT | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT.S | CTRSMKERNEL_LT = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_RN | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT.S | CTRSMKERNEL_RN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef CTRSMKERNEL_RT | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT.S | CTRSMKERNEL_RT = ztrsm_kernel_RT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_LN | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_LT | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_RN | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | ||||
| endif | |||||
| ifndef ZTRSMKERNEL_RT | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | ||||
| endif | |||||
| CGEMM3MKERNEL = zgemm3m_kernel.S | CGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | ZGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| @@ -1,18 +1,48 @@ | |||||
| SAXPYKERNEL=axpy_loongson3a.S | SAXPYKERNEL=axpy_loongson3a.S | ||||
| DAXPYKERNEL=daxpy_loongson3a_simd.S | DAXPYKERNEL=daxpy_loongson3a_simd.S | ||||
| SGEMMKERNEL = sgemm_kernel_loongson3a.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| SGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| DGEMVNKERNEL = gemv_n_loongson3a.c | |||||
| DGEMVTKERNEL = gemv_t_loongson3a.c | |||||
| CGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| CGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||||
| SGEMMKERNEL = sgemm_kernel_8x4_ps.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | SGEMMONCOPYOBJ = sgemm_oncopy.o | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | SGEMMOTCOPYOBJ = sgemm_otcopy.o | ||||
| DGEMMKERNEL = gemm_kernel_loongson3a.S | |||||
| DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | DGEMMONCOPY = ../generic/gemm_ncopy_4.c | ||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | DGEMMONCOPYOBJ = dgemm_oncopy.o | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | DGEMMOTCOPYOBJ = dgemm_otcopy.o | ||||
| CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| @@ -0,0 +1,101 @@ | |||||
| #include "common.h" | |||||
| //These are auto-tuning codes on Loongson-3A platform. | |||||
| //#define prefetch(x) __builtin_prefetch(x) | |||||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) /* emits an ld instruction that reads x into register $0; presumably used as a software prefetch whose loaded value is discarded (TODO confirm on target) */ | |||||
| #define likely(x) __builtin_expect(!!(x), 1) /* branch hint: the condition is expected to be true */ | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) /* branch hint: the condition is expected to be false */ | |||||
| #define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) /* one accumulation step with Y indexed by i and no ALPHA factor; advances i; uses i, j, k from the expansion site */ | |||||
| #define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) /* same step as spec_loop_alpha1, with the product scaled by ALPHA */ | |||||
| #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) /* one accumulation step with Y indexed by h and no ALPHA factor; advances i and moves h by INCY */ | |||||
| #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) /* same step as norm_loop_alpha1, with the product scaled by ALPHA */ | |||||
| /* GEMV kernel, column sweep: for every j < N and i < M it performs | |||||
|    Y[i * INCY] += ALPHA * A[LDA * j + i] * X[j * INCX]. | |||||
|    Returns 0 at once when ALPHA is zero; otherwise returns 0 after the update. | |||||
|    UNUSED and BUFFER are not referenced. Index offsets always start at 0, so | |||||
|    negative increments are not adjusted here. */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) | |||||
| { | |||||
|   /* Look-ahead distance (in elements) for prefetch and unroll factor of the main loop. */ | |||||
|   const BLASLONG fahead = 30; | |||||
|   const BLASLONG spec_unroll = 4; | |||||
|   BLASLONG tMQ, j, k; | |||||
|   if(ALPHA == 0) | |||||
|     return 0; | |||||
|   /* Largest multiple of spec_unroll not exceeding M: rows handled by the unrolled loop. */ | |||||
|   tMQ = M - M % spec_unroll; | |||||
|   if(ALPHA == 1) { | |||||
|     if(INCY == 1) { | |||||
|       /* ALPHA == 1, Y indexed directly by i. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCX) { | |||||
|         BLASLONG i = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(Y[i + fahead]); | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|         } | |||||
|         /* Remaining rows, one per iteration (the macro advances i). */ | |||||
|         while(likely(i < M)) { | |||||
|           spec_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       /* ALPHA == 1, Y walked through h in steps of INCY. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCX) { | |||||
|         BLASLONG i = 0, h = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(Y[h + fahead]); | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           norm_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } else { | |||||
|     if(INCY == 1) { | |||||
|       /* General ALPHA, Y indexed directly by i. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCX) { | |||||
|         BLASLONG i = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(Y[i + fahead]); | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           spec_loop; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       /* General ALPHA, Y walked through h in steps of INCY. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCX) { | |||||
|         BLASLONG i = 0, h = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(Y[h + fahead]); | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           norm_loop; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| @@ -0,0 +1,93 @@ | |||||
| #include "common.h" | |||||
| //These are auto-tuning codes on Loongson-3A platform. | |||||
| //#define prefetch(x) __builtin_prefetch(x) | |||||
| //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) | |||||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) /* emits an ld instruction that reads x into register $0; presumably used as a software prefetch whose loaded value is discarded (TODO confirm on target) */ | |||||
| #define likely(x) __builtin_expect(!!(x), 1) /* branch hint: the condition is expected to be true */ | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) /* branch hint: the condition is expected to be false */ | |||||
| #define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) /* adds one product to Y[k] with X indexed by i and no ALPHA factor; advances i; uses i, j, k from the expansion site */ | |||||
| #define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) /* same step as spec_loop_alpha1, with the product scaled by ALPHA */ | |||||
| #define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) /* adds one product to Y[k] with X indexed by h and no ALPHA factor; advances i and moves h by INCX */ | |||||
| #define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) /* same step as norm_loop_alpha1, with the product scaled by ALPHA */ | |||||
| /* GEMV kernel, per-column accumulation: for every j < N it adds | |||||
|    ALPHA * A[LDA * j + i] * X[i * INCX], for all i < M, to Y[j * INCY]. | |||||
|    Returns 0 at once when ALPHA is zero; otherwise returns 0 after the update. | |||||
|    UNUSED and BUFFER are not referenced. Index offsets always start at 0, so | |||||
|    negative increments are not adjusted here. */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
|   /* Look-ahead distance (in elements) for prefetch and unroll factor of the main loop. */ | |||||
|   const BLASLONG fahead = 30; | |||||
|   const BLASLONG spec_unroll = 3; | |||||
|   BLASLONG tMQ, j, k; | |||||
|   if(ALPHA == 0) | |||||
|     return 0; | |||||
|   /* Largest multiple of spec_unroll not exceeding M: rows handled by the unrolled loop. */ | |||||
|   tMQ = M - M % spec_unroll; | |||||
|   if(ALPHA == 1) { | |||||
|     if(INCX == 1) { | |||||
|       /* ALPHA == 1, X indexed directly by i. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCY) { | |||||
|         BLASLONG i = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(X[i + fahead]); | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|         } | |||||
|         /* Remaining rows, one per iteration (the macro advances i). */ | |||||
|         while(likely(i < M)) { | |||||
|           spec_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       /* ALPHA == 1, X walked through h in steps of INCX. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCY) { | |||||
|         BLASLONG i = 0, h = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(X[h + fahead]); | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           norm_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } else { | |||||
|     if(INCX == 1) { | |||||
|       /* General ALPHA, X indexed directly by i. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCY) { | |||||
|         BLASLONG i = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(X[i + fahead]); | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           spec_loop; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       /* General ALPHA, X walked through h in steps of INCX. */ | |||||
|       for(j = 0, k = 0; likely(j < N); j++, k += INCY) { | |||||
|         BLASLONG i = 0, h = 0; | |||||
|         while(likely(i < tMQ)) { | |||||
|           prefetch(A[LDA * j + i + fahead]); | |||||
|           prefetch(X[h + fahead]); | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|         } | |||||
|         while(likely(i < M)) { | |||||
|           norm_loop; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| @@ -0,0 +1,139 @@ | |||||
| #include "common.h" | |||||
| //typedef int BLASLONG; | |||||
| //typedef double FLOAT; | |||||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) /* emits an ld instruction that reads x into register $0; presumably used as a software prefetch whose loaded value is discarded (TODO confirm on target) */ | |||||
| #define likely(x) __builtin_expect(!!(x), 1) /* branch hint: the condition is expected to be true */ | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) /* branch hint: the condition is expected to be false */ | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* variant _0: the macro bodies form the plain complex product A * X */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||||
| #define spec_loop spec_loop_0 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||||
| #define norm_loop norm_loop_0 | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) /* variant _1: the macro bodies form conj(A) * X */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||||
| #define spec_loop spec_loop_1 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||||
| #define norm_loop norm_loop_1 | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) /* variant _2: the macro bodies form A * conj(X) */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||||
| #define spec_loop spec_loop_2 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||||
| #define norm_loop norm_loop_2 | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) /* variant _3: the macro bodies form conj(A) * conj(X) */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||||
| #define spec_loop spec_loop_3 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||||
| #define norm_loop norm_loop_3 | |||||
| #endif | |||||
| #define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) | |||||
| #define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) | |||||
| /* Complex GEMV kernel, column sweep. For every column j < N and row i < M it adds | |||||
|    alpha * a(i,j) * x(j) to y(i), where alpha = rALPHA + i*iALPHA and the loop macros | |||||
|    treat consecutive element pairs of A, X and Y as (real, imaginary). | |||||
|    Conjugation of A and/or X is chosen at compile time by CONJ / XCONJ through the | |||||
|    spec_loop and norm_loop macro families. | |||||
|    A column j starts at A[2 * LDA * j]; x(j) is at X[2 * INCX * j]; y(i) is at Y[2 * INCY * i]. | |||||
|    UNUSED and BUFFER are not referenced. Always returns 0. */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
|   /* Nothing to add only when alpha is exactly zero, i.e. both parts are zero. | |||||
|      The previous test (!rALPHA && iALPHA) skipped the update for a purely imaginary alpha. */ | |||||
|   if(!rALPHA && !iALPHA) | |||||
|     return 0; | |||||
|   /* Look-ahead distance (in FLOAT elements) for prefetch and unroll factor of the main loop. */ | |||||
|   BLASLONG fahead = 60; | |||||
|   BLASLONG spec_unroll = 2; | |||||
|   /* Largest multiple of spec_unroll not exceeding M: rows handled by the unrolled loop. */ | |||||
|   BLASLONG tMQ = M - M % spec_unroll; | |||||
|   /* j: column index; k: offset of x(j) in X; jj: offset of column j in A. */ | |||||
|   BLASLONG j = 0, k = 0, jj = 0; | |||||
|   if(rALPHA == 1 && iALPHA == 0) { | |||||
|     /* alpha == 1: the alpha1 macros skip the multiplication by alpha. */ | |||||
|     if(INCY == 1) { | |||||
|       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
|         /* i counts complex rows; ii is the FLOAT offset and is advanced by the macros. */ | |||||
|         BLASLONG i = 0, ii = 0; | |||||
|         for(; likely(i < tMQ); i += spec_unroll) { | |||||
|           prefetch(A[jj + ii + fahead]); | |||||
|           prefetch(Y[ii + fahead]); | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|           /*loop_mark*/ spec_loop_alpha1; | |||||
|         } | |||||
|         for(; likely(i < M); i++) { | |||||
|           spec_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
|         /* iii is the FLOAT offset into Y, moved by INCY * 2 inside the macros. */ | |||||
|         BLASLONG i = 0, ii = 0, iii = 0; | |||||
|         for(; likely(i < tMQ); i += spec_unroll) { | |||||
|           prefetch(A[jj + ii + fahead]); | |||||
|           prefetch(Y[iii + fahead]); | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|           /*loop_mark*/ norm_loop_alpha1; | |||||
|         } | |||||
|         for(; likely(i < M); i++) { | |||||
|           norm_loop_alpha1; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } else { | |||||
|     /* General alpha: the macros build the product in rTmp / iTmp and then scale it by alpha. */ | |||||
|     FLOAT rTmp, iTmp; | |||||
|     if(INCY == 1) { | |||||
|       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
|         BLASLONG i = 0, ii = 0; | |||||
|         for(; likely(i < tMQ); i += spec_unroll) { | |||||
|           prefetch(A[jj + ii + fahead]); | |||||
|           prefetch(Y[ii + fahead]); | |||||
|           /*loop_mark*/ spec_loop; | |||||
|           /*loop_mark*/ spec_loop; | |||||
|         } | |||||
|         for(; likely(i < M); i++) { | |||||
|           spec_loop; | |||||
|         } | |||||
|       } | |||||
|     } else { | |||||
|       for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { | |||||
|         BLASLONG i = 0, ii = 0, iii = 0; | |||||
|         for(; likely(i < tMQ); i += spec_unroll) { | |||||
|           prefetch(A[jj + ii + fahead]); | |||||
|           prefetch(Y[iii + fahead]); | |||||
|           /*loop_mark*/ norm_loop; | |||||
|           /*loop_mark*/ norm_loop; | |||||
|         } | |||||
|         for(; likely(i < M); i++) { | |||||
|           norm_loop; | |||||
|         } | |||||
|       } | |||||
|     } | |||||
|   } | |||||
|   return 0; | |||||
| } | |||||
| @@ -0,0 +1,125 @@ | |||||
| #include "common.h" | |||||
| #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) /* emits an ld instruction that reads x into register $0; presumably used as a software prefetch whose loaded value is discarded (TODO confirm on target) */ | |||||
| #define likely(x) __builtin_expect(!!(x), 1) /* branch hint: the condition is expected to be true */ | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) /* branch hint: the condition is expected to be false */ | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* variant _0: the macro bodies form the plain complex product A * X */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_0 | |||||
| #define spec_loop spec_loop_0 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_0 | |||||
| #define norm_loop norm_loop_0 | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) /* variant _1: the macro bodies form conj(A) * X */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_1 | |||||
| #define spec_loop spec_loop_1 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_1 | |||||
| #define norm_loop norm_loop_1 | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) /* variant _2: the macro bodies form A * conj(X) */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_2 | |||||
| #define spec_loop spec_loop_2 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_2 | |||||
| #define norm_loop norm_loop_2 | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) /* variant _3: the macro bodies form conj(A) * conj(X) */ | |||||
| #define spec_loop_alpha1 spec_loop_alpha1_3 | |||||
| #define spec_loop spec_loop_3 | |||||
| #define norm_loop_alpha1 norm_loop_alpha1_3 | |||||
| #define norm_loop norm_loop_3 | |||||
| #endif | |||||
| #define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) | |||||
| #define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) /* Y[k],Y[k+1] += conj(A) * X for one complex element (re at [i], im at [i+1]); X indexed by ii; no alpha scaling */ | |||||
| #define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) /* Y[k],Y[k+1] += A * conj(X); X indexed by ii; no alpha scaling */ | |||||
| #define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) /* Y[k],Y[k+1] += conj(A) * conj(X); X indexed by ii; no alpha scaling */ | |||||
| #define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) /* (rTmp,iTmp) = A * X, then Y += alpha * tmp; X indexed by ii; needs rTmp/iTmp in the caller's scope */ | |||||
| #define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) /* (rTmp,iTmp) = conj(A) * X, then Y += alpha * tmp; X indexed by ii */ | |||||
| #define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) /* (rTmp,iTmp) = A * conj(X), then Y += alpha * tmp; X indexed by ii */ | |||||
| #define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) /* (rTmp,iTmp) = conj(A) * conj(X), then Y += alpha * tmp; X indexed by ii */ | |||||
| #define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) /* Y[k],Y[k+1] += A * X; X indexed by iii, which advances by INCX * 2; no alpha scaling */ | |||||
| #define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) /* Y[k],Y[k+1] += conj(A) * X; X indexed by iii (step INCX * 2); no alpha scaling */ | |||||
| #define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) /* Y[k],Y[k+1] += A * conj(X); X indexed by iii (step INCX * 2); no alpha scaling */ | |||||
| #define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) /* Y[k],Y[k+1] += conj(A) * conj(X); X indexed by iii (step INCX * 2); no alpha scaling */ | |||||
| #define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) /* (rTmp,iTmp) = A * X, then Y += alpha * tmp; X indexed by iii (step INCX * 2) */ | |||||
| #define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) /* (rTmp,iTmp) = conj(A) * X, then Y += alpha * tmp; X indexed by iii (step INCX * 2) */ | |||||
| #define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) /* (rTmp,iTmp) = A * conj(X), then Y += alpha * tmp; X indexed by iii (step INCX * 2) */ | |||||
| #define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) /* (rTmp,iTmp) = conj(A) * conj(X), then Y += alpha * tmp; X indexed by iii (step INCX * 2) */ | |||||
| /* Complex matrix-vector accumulation kernel: for each of the N columns j of A, | |||||
|  * adds alpha * sum over i < M of (A[i,j] combined with X[i]) into the complex | |||||
|  * element of Y at offset j * INCY * 2. How A and X are combined (plain or | |||||
|  * conjugated) is decided by the spec_loop / norm_loop macros selected elsewhere. | |||||
|  * | |||||
|  * M, N         - rows per column and number of columns processed. | |||||
|  * UNUSED       - not referenced. | |||||
|  * rALPHA/iALPHA- real and imaginary parts of the scaling factor alpha. | |||||
|  * A, LDA       - matrix data as interleaved re/im values; consecutive columns | |||||
|  *                are LDA * 2 values apart. | |||||
|  * X, INCX      - input vector; INCX == 1 takes the path that indexes X with | |||||
|  *                the same running offset as A. | |||||
|  * Y, INCY      - accumulated in place; consecutive outputs are INCY * 2 apart. | |||||
|  * BUFFER       - not referenced. | |||||
|  * | |||||
|  * Always returns 0. | |||||
|  */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { | |||||
| /* Only alpha == 0 (both parts zero) leaves Y untouched. The previous test | |||||
|  * (!rALPHA && iALPHA) wrongly skipped all work for a purely imaginary alpha. */ | |||||
| if(rALPHA == 0 && iALPHA == 0) { | |||||
| return 0; | |||||
| } | |||||
| BLASLONG fahead = 30; /* prefetch distance, in FLOAT elements */ | |||||
| BLASLONG spec_unroll = 2; /* complex elements handled per unrolled iteration */ | |||||
| BLASLONG tMQ = M - M % spec_unroll; /* largest multiple of spec_unroll <= M */ | |||||
| BLASLONG j = 0, k = 0, jj = 0; /* column index, offset into Y, offset into A */ | |||||
| if(rALPHA == 1 && iALPHA == 0) { | |||||
| /* alpha == 1: use the macros that skip the multiplication by alpha. */ | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| /*loop_mark*/ spec_loop_alpha1; | |||||
| } | |||||
| /* leftover element when M is not a multiple of spec_unroll */ | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; /* iii: strided offset into X */ | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| /*loop_mark*/ norm_loop_alpha1; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop_alpha1; | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| /* General alpha: the loop macros stage each product in rTmp/iTmp | |||||
|  * before scaling it by alpha. */ | |||||
| FLOAT rTmp, iTmp; | |||||
| if(INCX == 1) { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[ii + fahead]); | |||||
| /*loop_mark*/ spec_loop; | |||||
| /*loop_mark*/ spec_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| spec_loop; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { | |||||
| BLASLONG i = 0, ii = 0, iii = 0; | |||||
| for(; likely(i < tMQ); i += spec_unroll) { | |||||
| prefetch(A[jj + ii + fahead]); | |||||
| prefetch(X[iii + fahead]); | |||||
| /*loop_mark*/ norm_loop; | |||||
| /*loop_mark*/ norm_loop; | |||||
| } | |||||
| for(; likely(i < M); i++) { | |||||
| norm_loop; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -1480,31 +1480,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | #define DGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_P 32 | |||||
| #define DGEMM_DEFAULT_P 32 | |||||
| #define CGEMM_DEFAULT_P 108 | |||||
| #define ZGEMM_DEFAULT_P 112 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_Q 116 | |||||
| #define DGEMM_DEFAULT_Q 116 | |||||
| #define CGEMM_DEFAULT_Q 144 | |||||
| #define ZGEMM_DEFAULT_Q 72 | |||||
| #define SGEMM_DEFAULT_P 64 | |||||
| #define DGEMM_DEFAULT_P 44 | |||||
| #define CGEMM_DEFAULT_P 64 | |||||
| #define ZGEMM_DEFAULT_P 32 | |||||
| #define SGEMM_DEFAULT_R 1000 | |||||
| #define DGEMM_DEFAULT_R 1000 | |||||
| #define CGEMM_DEFAULT_R 2000 | |||||
| #define ZGEMM_DEFAULT_R 2000 | |||||
| #define SGEMM_DEFAULT_Q 192 | |||||
| #define DGEMM_DEFAULT_Q 92 | |||||
| #define CGEMM_DEFAULT_Q 128 | |||||
| #define ZGEMM_DEFAULT_Q 80 | |||||
| #define SGEMM_DEFAULT_R 1024 | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R 1024 | |||||
| #define ZGEMM_DEFAULT_R 1024 | |||||
| #define GEMM_OFFSET_A1 0x10000 | |||||
| #define GEMM_OFFSET_B1 0x100000 | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||
| @@ -1301,6 +1301,8 @@ | |||||
| NC = 0 | NC = 0 | ||||
| RESET = .TRUE. | RESET = .TRUE. | ||||
| ERRMAX = RZERO | ERRMAX = RZERO | ||||
| RALS = RONE | |||||
| RBETS = RONE | |||||
| * | * | ||||
| DO 100 IN = 1, NIDIM | DO 100 IN = 1, NIDIM | ||||
| N = IDIM( IN ) | N = IDIM( IN ) | ||||
| @@ -1303,6 +1303,8 @@ | |||||
| NC = 0 | NC = 0 | ||||
| RESET = .TRUE. | RESET = .TRUE. | ||||
| ERRMAX = RZERO | ERRMAX = RZERO | ||||
| RALS = RONE | |||||
| RBETS = RONE | |||||
| * | * | ||||
| DO 100 IN = 1, NIDIM | DO 100 IN = 1, NIDIM | ||||
| N = IDIM( IN ) | N = IDIM( IN ) | ||||