| @@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #else | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && defined(DOUBLE) && !defined(COMPLEX) | |||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #endif | |||
| START_RPCC(); | |||
| @@ -360,8 +360,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ | |||
| min_jj = MIN(n_to, xxx + div_n) - jjs; | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && defined(DOUBLE) && !defined(COMPLEX) | |||
| if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #else | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #endif | |||
| START_RPCC(); | |||
| OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | |||
| @@ -634,7 +646,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| num_cpu_n ++; | |||
| } | |||
| for (j = 0; j < num_cpu_m; j++) { | |||
| for (i = 0; i < num_cpu_m; i++) { | |||
| for (k = 0; k < DIVIDE_RATE; k++) { | |||
| @@ -10,13 +10,13 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
| @@ -38,25 +38,27 @@ ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| @@ -153,7 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX) | |||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||
| #else | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #endif | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| @@ -161,14 +165,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef ARCH_X86 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||
| #if defined(BULLDOZER) && !defined(COMPLEX) | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #else | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #endif | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| @@ -193,26 +201,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #define SGEMM_DEFAULT_P 448 | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) | |||
| #define DGEMM_DEFAULT_P 248 | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX) | |||
| #define DGEMM_DEFAULT_P 384 | |||
| #else | |||
| #define DGEMM_DEFAULT_P 224 | |||
| #endif | |||
| #define QGEMM_DEFAULT_P 112 | |||
| #define CGEMM_DEFAULT_P 224 | |||
| #define ZGEMM_DEFAULT_P 112 | |||
| #define XGEMM_DEFAULT_P 56 | |||
| #define SGEMM_DEFAULT_Q 224 | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) | |||
| #define DGEMM_DEFAULT_Q 248 | |||
| #if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX) | |||
| #define DGEMM_DEFAULT_Q 168 | |||
| #else | |||
| #define DGEMM_DEFAULT_Q 224 | |||
| #endif | |||
| #define QGEMM_DEFAULT_Q 224 | |||
| #define CGEMM_DEFAULT_Q 224 | |||
| #define ZGEMM_DEFAULT_Q 224 | |||
| @@ -230,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #define HAVE_EXCLUSIVE_CACHE | |||
| #define GEMM_THREAD gemm_thread_mn | |||
| #define GEMM_THREAD gemm_thread_m | |||
| #endif | |||