Changing the unroll factors for dgemm to 8 shows improved performance with the POWER10 MMA feature. Also made some minor changes in sgemm for edge cases.

tags/v0.3.13^2
| @@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_power10.c | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = dgemm_ncopy_8_power10.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG N = n; | |||
| BLASLONG i1; | |||
| #if defined(TRMMKERNEL) | |||
| BLASLONG off; | |||
| @@ -158,85 +157,232 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| off = -offset; | |||
| #endif | |||
| v4sf_t valpha = { alpha, alpha }; | |||
| N = n >> 2; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| for (i1 = 0; i1 < (n >> 3); i1++) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| BLASLONG j, temp; | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| CO = C; | |||
| C += ldc << 2; | |||
| C += ldc << 3; | |||
| AO = A; | |||
| PREFETCH1 (A, 128); | |||
| PREFETCH1 (A, 256); | |||
| i = m >> 4; | |||
| for (j = 0; j < i; j++) | |||
| for (j = 0; j < (m >> 3); j++) | |||
| { | |||
| FLOAT *BO; | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 4); | |||
| REFRESH_POINTERS (8, 8); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; | |||
| BLASLONG l = 0; | |||
| PREFETCH1 (CO, 0); | |||
| PREFETCH1 (CO + ldc, 0); | |||
| PREFETCH1 (CO + ldc + ldc, 0); | |||
| PREFETCH1 (CO + ldc + ldc + ldc, 0); | |||
| PREFETCH1 (CO, 128); | |||
| PREFETCH1 (CO + ldc, 128); | |||
| PREFETCH1 (CO + ldc + ldc, 128); | |||
| PREFETCH1 (CO + ldc + ldc + ldc, 128); | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); | |||
| __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); | |||
| __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); | |||
| __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 4]; | |||
| rb = (vec_t *) & BO[l << 2]; | |||
| rowA = (vec_t *) & AO[l << 3]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); | |||
| __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC (&acc2, 4); | |||
| SAVE_ACC (&acc1, 2); | |||
| SAVE_ACC (&acc3, 6); | |||
| SAVE_ACC (&acc4, 8); | |||
| SAVE_ACC (&acc6, 12); | |||
| SAVE_ACC (&acc5, 10); | |||
| SAVE_ACC (&acc7, 14); | |||
| AO += temp << 4; | |||
| BO += temp << 2; | |||
| SAVE_ACC1 (&acc1, 0); | |||
| SAVE_ACC (&acc2, 2); | |||
| SAVE_ACC1 (&acc3, 2); | |||
| SAVE_ACC (&acc4, 4); | |||
| SAVE_ACC1 (&acc5, 4); | |||
| SAVE_ACC (&acc6, 6); | |||
| SAVE_ACC1 (&acc7, 6); | |||
| CO += 8; | |||
| AO += temp << 3; | |||
| BO += temp << 3; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 8) | |||
| #endif | |||
| } | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 8); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| BLASLONG l = 0; | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 2]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC1 (&acc1, 0); | |||
| SAVE_ACC (&acc2, 2); | |||
| SAVE_ACC1 (&acc3, 2); | |||
| CO += 4; | |||
| AO += temp << 2; | |||
| BO += temp << 3; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 8) | |||
| #endif | |||
| } | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (2, 8); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1; | |||
| BLASLONG l = 0; | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __vector_pair rowB, rowB1; | |||
| vec_t *rb = (vec_t *) & BO[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| rowA = (vec_t *) & AO[l << 1]; | |||
| rb = (vec_t *) & BO[l << 3]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC1 (&acc1, 0); | |||
| CO += 2; | |||
| AO += temp << 1; | |||
| BO += temp << 3; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 4) | |||
| REFRESH_AFTER_SAVE (2, 8) | |||
| #endif | |||
| CO += 16; | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (1, 8); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| v4sf_t t2 = { 0, 0 }; | |||
| v4sf_t t3 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowA = { AO[l], AO[l] }; | |||
| v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; | |||
| v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; | |||
| v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; | |||
| v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA * rowB1; | |||
| t2 += rowA * rowB2; | |||
| t3 += rowA * rowB3; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| t2 = t2 * valpha; | |||
| t3 = t3 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0 * ldc] = t[0]; | |||
| CO[1 * ldc] = t[1]; | |||
| CO[2 * ldc] = t1[0]; | |||
| CO[3 * ldc] = t1[1]; | |||
| CO[4 * ldc] = t2[0]; | |||
| CO[5 * ldc] = t2[1]; | |||
| CO[6 * ldc] = t3[0]; | |||
| CO[7 * ldc] = t3[1]; | |||
| #else | |||
| CO[0 * ldc] += t[0]; | |||
| CO[1 * ldc] += t[1]; | |||
| CO[2 * ldc] += t1[0]; | |||
| CO[3 * ldc] += t1[1]; | |||
| CO[4 * ldc] += t2[0]; | |||
| CO[5 * ldc] += t2[1]; | |||
| CO[6 * ldc] += t3[0]; | |||
| CO[7 * ldc] += t3[1]; | |||
| #endif | |||
| CO += 1; | |||
| AO += temp; | |||
| BO += temp << 3; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 8) | |||
| #endif | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 8; // number of values in A | |||
| #endif | |||
| B += k << 3; | |||
| } | |||
| if (n & 4) | |||
| { | |||
| BLASLONG j, temp; | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| CO = C; | |||
| C += ldc << 2; | |||
| AO = A; | |||
| PREFETCH1 (A, 128); | |||
| PREFETCH1 (A, 256); | |||
| for (j = 0; j < (m >> 3); j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (8, 4) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (4, 4) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (2, 4) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| B += k << 2; | |||
| } | |||
| N = (n & 3) >> 1; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| if (n & 2) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| BLASLONG j, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| @@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| CO = C; | |||
| C += ldc << 1; | |||
| AO = A; | |||
| i = m >> 4; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| BLASLONG l = 0; | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[0], t[1] = BO[1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[0]; | |||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); | |||
| for (l = 1; l < temp; l++) | |||
| { | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| rowA = (vec_t *) & AO[l << 4]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 2); | |||
| SAVE2x4_ACC (&acc2, 4); | |||
| SAVE2x4_ACC (&acc3, 6); | |||
| SAVE2x4_ACC (&acc4, 8); | |||
| SAVE2x4_ACC (&acc5, 10); | |||
| SAVE2x4_ACC (&acc6, 12); | |||
| SAVE2x4_ACC (&acc7, 14); | |||
| CO += 16; | |||
| AO += temp << 4; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 2) | |||
| #endif | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| for (j = 0; j < (m >> 3); j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (8, 2) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (4, 2) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (2, 2) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| B += k << 1; | |||
| } | |||
| N = (n & 1) >> 0; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| if (n & 1) | |||
| { | |||
| BLASLONG i, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| @@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| CO = C; | |||
| C += ldc; | |||
| AO = A; | |||
| i = m; | |||
| while (i >= 16) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| v4sf_t t2 = { 0, 0 }; | |||
| v4sf_t t3 = { 0, 0 }; | |||
| v4sf_t t4 = { 0, 0 }; | |||
| v4sf_t t5 = { 0, 0 }; | |||
| v4sf_t t6 = { 0, 0 }; | |||
| v4sf_t t7 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowB = { BO[l], BO[l] }; | |||
| v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; | |||
| v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; | |||
| v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; | |||
| v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; | |||
| v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; | |||
| v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; | |||
| v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; | |||
| v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA1 * rowB; | |||
| t2 += rowA2 * rowB; | |||
| t3 += rowA3 * rowB; | |||
| t4 += rowA4 * rowB; | |||
| t5 += rowA5 * rowB; | |||
| t6 += rowA6 * rowB; | |||
| t7 += rowA7 * rowB; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| t2 = t2 * valpha; | |||
| t3 = t3 * valpha; | |||
| t4 = t4 * valpha; | |||
| t5 = t5 * valpha; | |||
| t6 = t6 * valpha; | |||
| t7 = t7 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t[0]; | |||
| CO[1] = t[1]; | |||
| CO[2] = t1[0]; | |||
| CO[3] = t1[1]; | |||
| CO[4] = t2[0]; | |||
| CO[5] = t2[1]; | |||
| CO[6] = t3[0]; | |||
| CO[7] = t3[1]; | |||
| CO[8] = t4[0]; | |||
| CO[9] = t4[1]; | |||
| CO[10] = t5[0]; | |||
| CO[11] = t5[1]; | |||
| CO[12] = t6[0]; | |||
| CO[13] = t6[1]; | |||
| CO[14] = t7[0]; | |||
| CO[15] = t7[1]; | |||
| #else | |||
| CO[0] += t[0]; | |||
| CO[1] += t[1]; | |||
| CO[2] += t1[0]; | |||
| CO[3] += t1[1]; | |||
| CO[4] += t2[0]; | |||
| CO[5] += t2[1]; | |||
| CO[6] += t3[0]; | |||
| CO[7] += t3[1]; | |||
| CO[8] += t4[0]; | |||
| CO[9] += t4[1]; | |||
| CO[10] += t5[0]; | |||
| CO[11] += t5[1]; | |||
| CO[12] += t6[0]; | |||
| CO[13] += t6[1]; | |||
| CO[14] += t7[0]; | |||
| CO[15] += t7[1]; | |||
| #endif | |||
| AO += temp << 4; | |||
| BO += temp; | |||
| CO += 16; | |||
| i -= 16; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 1) | |||
| #endif | |||
| } | |||
| while (i >= 8) | |||
| for (i = 0; i < (m >> 3); i++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 3; | |||
| BO += temp; | |||
| CO += 8; | |||
| i -= 8; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 1) | |||
| #endif | |||
| } | |||
| while (i >= 4) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 2; | |||
| BO += temp; | |||
| CO += 4; | |||
| i -= 4; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 1) | |||
| #endif | |||
| } | |||
| while (i >= 2) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 1; | |||
| BO += temp; | |||
| CO += 2; | |||
| i -= 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 1) | |||
| #endif | |||
| } | |||
| while (i >= 1) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| CO[0] += t * alpha; | |||
| #endif | |||
| CO += 1; | |||
| i -= 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 1) | |||
| #endif | |||
| @@ -0,0 +1,326 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| #define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); | |||
/*
 * CNAME (dgemm ncopy, width 8, POWER10):
 * Packs a column-major matrix A (m rows, n columns, leading dimension lda)
 * into the contiguous buffer b in the layout expected by the 8-wide
 * POWER10 dgemm kernel: columns are processed in groups of 8 (then 4, 2, 1
 * for the remainder of n), and within each group the elements of one row
 * across the group's columns are stored consecutively.
 *
 * Returns 0 unconditionally.
 *
 * NOTE(review): this routine assumes 16-byte alignment suitable for the
 * __vector double loads/stores on both a and b — confirm against the
 * buffer allocation in the caller.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;
  IFLOAT *aoffset;
  /* one cursor per source column of the current column group */
  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
  IFLOAT *boffset;
  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  IFLOAT ctemp09, ctemp17, ctemp33;
  IFLOAT ctemp25, ctemp41;
  IFLOAT ctemp49, ctemp57;
  aoffset = a;
  boffset = b;
  /* ---- groups of 8 columns ---- */
  j = (n >> 3);
  if (j > 0){
    do{
      aoffset1 = aoffset;
      aoffset2 = aoffset1 + lda;
      aoffset3 = aoffset2 + lda;
      aoffset4 = aoffset3 + lda;
      aoffset5 = aoffset4 + lda;
      aoffset6 = aoffset5 + lda;
      aoffset7 = aoffset6 + lda;
      aoffset8 = aoffset7 + lda;
      aoffset += 8 * lda;
      /* 8 rows at a time: load an 8x8 tile and transpose it into b
         with doubleword permutes */
      i = (m >> 3);
      if (i > 0){
        do{
          PREFETCHA (aoffset1, 384);
          PREFETCHA (aoffset2, 384);
          PREFETCHA (aoffset3, 384);
          PREFETCHA (aoffset4, 384);
          PREFETCHA (aoffset5, 384);
          PREFETCHA (aoffset6, 384);
          PREFETCHA (aoffset7, 384);
          PREFETCHA (aoffset8, 384);
          /* va(4k)..va(4k+3) hold 8 consecutive rows of column k+1 */
          __vector double va0 = *(__vector double*)(aoffset1 + 0);
          __vector double va1 = *(__vector double*)(aoffset1 + 2);
          __vector double va2 = *(__vector double*)(aoffset1 + 4);
          __vector double va3 = *(__vector double*)(aoffset1 + 6);
          __vector double va4 = *(__vector double*)(aoffset2 + 0);
          __vector double va5 = *(__vector double*)(aoffset2 + 2);
          __vector double va6 = *(__vector double*)(aoffset2 + 4);
          __vector double va7 = *(__vector double*)(aoffset2 + 6);
          __vector double va8 = *(__vector double*)(aoffset3 + 0);
          __vector double va9 = *(__vector double*)(aoffset3 + 2);
          __vector double va10 = *(__vector double*)(aoffset3 + 4);
          __vector double va11 = *(__vector double*)(aoffset3 + 6);
          __vector double va12 = *(__vector double*)(aoffset4 + 0);
          __vector double va13 = *(__vector double*)(aoffset4 + 2);
          __vector double va14 = *(__vector double*)(aoffset4 + 4);
          __vector double va15 = *(__vector double*)(aoffset4 + 6);
          __vector double va16 = *(__vector double*)(aoffset5 + 0);
          __vector double va17 = *(__vector double*)(aoffset5 + 2);
          __vector double va18 = *(__vector double*)(aoffset5 + 4);
          __vector double va19 = *(__vector double*)(aoffset5 + 6);
          __vector double va20 = *(__vector double*)(aoffset6 + 0);
          __vector double va21 = *(__vector double*)(aoffset6 + 2);
          __vector double va22 = *(__vector double*)(aoffset6 + 4);
          __vector double va23 = *(__vector double*)(aoffset6 + 6);
          __vector double va24 = *(__vector double*)(aoffset7 + 0);
          __vector double va25 = *(__vector double*)(aoffset7 + 2);
          __vector double va26 = *(__vector double*)(aoffset7 + 4);
          __vector double va27 = *(__vector double*)(aoffset7 + 6);
          __vector double va28 = *(__vector double*)(aoffset8 + 0);
          __vector double va29 = *(__vector double*)(aoffset8 + 2);
          __vector double va30 = *(__vector double*)(aoffset8 + 4);
          __vector double va31 = *(__vector double*)(aoffset8 + 6);
          /* vec_xxpermdi(x, y, 0) pairs the first doublewords of x and y,
             vec_xxpermdi(x, y, 3) pairs the second doublewords; each pair
             of stores below emits one full output row (8 doubles) */
          *(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va4, 0);
          *(__vector double*)(boffset +  2) = vec_xxpermdi(va8, va12, 0);
          *(__vector double*)(boffset +  4) = vec_xxpermdi(va16, va20, 0);
          *(__vector double*)(boffset +  6) = vec_xxpermdi(va24, va28, 0);
          *(__vector double*)(boffset +  8) = vec_xxpermdi(va0, va4, 3);
          *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
          *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
          *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
          *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
          *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
          *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
          *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
          *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
          *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
          *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
          *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
          *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
          *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
          *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
          *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
          *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
          *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
          *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
          *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
          *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
          *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
          *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
          *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
          *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
          *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
          *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
          *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
          aoffset1 += 8;
          aoffset2 += 8;
          aoffset3 += 8;
          aoffset4 += 8;
          aoffset5 += 8;
          aoffset6 += 8;
          aoffset7 += 8;
          aoffset8 += 8;
          boffset += 64;
          i --;
        }while(i > 0);
      }
      /* remaining m % 8 rows: scalar transpose, one row at a time */
      i = (m & 7);
      if (i > 0){
        do{
          ctemp01 = *(aoffset1 + 0);
          ctemp09 = *(aoffset2 + 0);
          ctemp17 = *(aoffset3 + 0);
          ctemp25 = *(aoffset4 + 0);
          ctemp33 = *(aoffset5 + 0);
          ctemp41 = *(aoffset6 + 0);
          ctemp49 = *(aoffset7 + 0);
          ctemp57 = *(aoffset8 + 0);
          *(boffset + 0) = ctemp01;
          *(boffset + 1) = ctemp09;
          *(boffset + 2) = ctemp17;
          *(boffset + 3) = ctemp25;
          *(boffset + 4) = ctemp33;
          *(boffset + 5) = ctemp41;
          *(boffset + 6) = ctemp49;
          *(boffset + 7) = ctemp57;
          aoffset1 ++;
          aoffset2 ++;
          aoffset3 ++;
          aoffset4 ++;
          aoffset5 ++;
          aoffset6 ++;
          aoffset7 ++;
          aoffset8 ++;
          boffset += 8;
          i --;
        }while(i > 0);
      }
      j--;
    }while(j > 0);
  } /* end of if(j > 0) */
  /* ---- remaining 4 columns (n & 4): 4x4 vector transpose ---- */
  if (n & 4){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset3 = aoffset2 + lda;
    aoffset4 = aoffset3 + lda;
    aoffset += 4 * lda;
    i = (m >> 2);
    if (i > 0){
      do{
        PREFETCHA (aoffset1, 384);
        PREFETCHA (aoffset2, 384);
        PREFETCHA (aoffset3, 384);
        PREFETCHA (aoffset4, 384);
        __vector double va0 = *(__vector double*)(aoffset1 + 0);
        __vector double va1 = *(__vector double*)(aoffset1 + 2);
        __vector double va2 = *(__vector double*)(aoffset2 + 0);
        __vector double va3 = *(__vector double*)(aoffset2 + 2);
        __vector double va4 = *(__vector double*)(aoffset3 + 0);
        __vector double va5 = *(__vector double*)(aoffset3 + 2);
        __vector double va6 = *(__vector double*)(aoffset4 + 0);
        __vector double va7 = *(__vector double*)(aoffset4 + 2);
        *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);
        *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);
        *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);
        *(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
        *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);
        *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
        *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
        *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
        aoffset1 += 4;
        aoffset2 += 4;
        aoffset3 += 4;
        aoffset4 += 4;
        boffset += 16;
        i --;
      }while(i > 0);
    }
    /* remaining m % 4 rows: scalar */
    i = (m & 3);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset2 + 0);
        ctemp03 = *(aoffset3 + 0);
        ctemp04 = *(aoffset4 + 0);
        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp02;
        *(boffset + 2) = ctemp03;
        *(boffset + 3) = ctemp04;
        aoffset1 ++;
        aoffset2 ++;
        aoffset3 ++;
        aoffset4 ++;
        boffset += 4;
        i --;
      }while(i > 0);
    }
  } /* end of if (n & 4) */
  /* ---- remaining 2 columns (n & 2): 2x2 vector transpose ---- */
  if (n & 2){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset += 2 * lda;
    i = (m >> 1);
    if (i > 0){
      do{
        __vector double va0 = *(__vector double*)(aoffset1 + 0);
        __vector double va1 = *(__vector double*)(aoffset2 + 0);
        *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
        *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
        aoffset1 += 2;
        aoffset2 += 2;
        boffset += 4;
        i --;
      }while(i > 0);
    }
    /* odd trailing row */
    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset2 + 0);
      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;
      aoffset1 ++;
      aoffset2 ++;
      boffset += 2;
    }
  } /* end of if (n & 2) */
  /* ---- last column (n & 1): straight copy ---- */
  if (n & 1){
    aoffset1 = aoffset;
    i = m;
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        *(boffset + 0) = ctemp01;
        aoffset1 ++;
        boffset ++;
        i --;
      }while(i > 0);
    }
  } /* end of if (n & 1) */
  return 0;
}
| @@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG N = n; | |||
| BLASLONG i1; | |||
| #if defined(TRMMKERNEL) | |||
| BLASLONG off; | |||
| @@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| v4sf_t valpha = { alpha, alpha, alpha, alpha }; | |||
| N = n >> 3; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| for (i1 = 0; i1 < (n >> 3); i1++) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| BLASLONG j, temp; | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| @@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO = A; | |||
| PREFETCH1 (A, 128); | |||
| PREFETCH1 (A, 256); | |||
| i = m >> 4; | |||
| for (j = 0; j < i; j++) | |||
| for (j = 0; j < (m >> 4); j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| #endif | |||
| CO += 16; | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 8) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (8, 8) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (4, 8) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (2, 8) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| B += k << 3; | |||
| } | |||
| N = (n & 7) >> 2; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| if (n & 4) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| @@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (16, 4) | |||
| #endif | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 8) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (8, 4) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (4, 4) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (2, 4) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| B += k << 2; | |||
| } | |||
| N = (n & 3) >> 1; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| if (n & 2) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| @@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (16, 2) | |||
| #endif | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 8) | |||
| { | |||
| FLOAT *BO; | |||
| v4sf_t *rowC; | |||
| @@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (8, 2) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| v4sf_t *rowC; | |||
| @@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (4, 2) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| REFRESH_AFTER_SAVE (2, 2) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| B += k << 1; | |||
| } | |||
| N = (n & 1) >> 0; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| if (n & 1) | |||
| { | |||
| BLASLONG i, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| @@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| CO = C; | |||
| C += ldc; | |||
| AO = A; | |||
| i = m; | |||
| while (i >= 16) | |||
| for (i = 0; i < (m >> 4); i++) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 4; | |||
| BO += temp; | |||
| CO += 16; | |||
| i -= 16; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 1) | |||
| #endif | |||
| } | |||
| while (i >= 8) | |||
| if (m & 8) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 3; | |||
| BO += temp; | |||
| CO += 8; | |||
| i -= 8; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 1) | |||
| #endif | |||
| } | |||
| while (i >= 4) | |||
| if (m & 4) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 2; | |||
| BO += temp; | |||
| CO += 4; | |||
| i -= 4; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 1) | |||
| #endif | |||
| } | |||
| while (i >= 2) | |||
| if (m & 2) | |||
| { | |||
| FLOAT *BO; | |||
| BLASLONG l = 0; | |||
| @@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| AO += temp << 1; | |||
| BO += temp; | |||
| CO += 2; | |||
| i -= 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 1) | |||
| #endif | |||
| } | |||
| while (i >= 1) | |||
| if (m & 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| @@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| CO[0] += t * alpha; | |||
| #endif | |||
| CO += 1; | |||
| i -= 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 1) | |||
| #endif | |||
| @@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SBGEMM_DEFAULT_P 832 | |||
| #define SBGEMM_DEFAULT_Q 1026 | |||
| #define SBGEMM_DEFAULT_R 4096 | |||
| #undef DGEMM_DEFAULT_UNROLL_M | |||
| #undef DGEMM_DEFAULT_UNROLL_N | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #endif | |||
| #if defined(SPARC) && defined(V7) | |||