|
|
|
@@ -134,21 +134,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); |
|
|
|
__builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ |
|
|
|
__builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ |
|
|
|
__builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); |
|
|
|
/* SET_ACC_ZERO4: apply __builtin_mma_xxsetaccz to the four accumulators
   acc0, acc1, acc2 and acc3 (the macro name indicates these are being
   zeroed).  The macro takes no arguments and refers to the accumulators
   by name, so variables with exactly those names must be in scope where
   it is expanded.
   NOTE(review): the expansion already ends in ';', so a call written as
   "SET_ACC_ZERO4 ();" leaves an extra empty statement, and the macro is
   not a single statement (unsafe as an unbraced if/else body).  */
#define SET_ACC_ZERO4() \ |
|
|
|
__builtin_mma_xxsetaccz (&acc0); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc1); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc2); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc3); |
|
|
|
|
|
|
|
/* SET_ACC_ZERO8: apply __builtin_mma_xxsetaccz to the eight accumulators
   acc0 through acc7 (the macro name indicates these are being zeroed).
   The macro takes no arguments and refers to the accumulators by name,
   so variables with exactly those names must be in scope where it is
   expanded.
   NOTE(review): the expansion already ends in ';', so a call written as
   "SET_ACC_ZERO8 ();" leaves an extra empty statement, and the macro is
   not a single statement (unsafe as an unbraced if/else body).  */
#define SET_ACC_ZERO8() \ |
|
|
|
__builtin_mma_xxsetaccz (&acc0); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc1); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc2); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc3); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc4); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc5); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc6); \ |
|
|
|
__builtin_mma_xxsetaccz (&acc7); |
|
|
|
|
|
|
|
/* PREFETCH1(x, y): emit a single "dcbt" instruction through inline asm.
   x is passed in a general register ("r") and y under the "b"
   constraint; both are inputs only.  The asm is volatile and declares a
   "memory" clobber, so the compiler will not delete it or move memory
   accesses across it.
   NOTE(review): dcbt is presumably the Power data-cache-block-touch
   (prefetch hint) instruction with x and y forming the effective
   address -- confirm against the ISA manual.  The macro body already
   ends in ';'.  */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); |
|
|
|
|
|
|
|
@@ -249,8 +234,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; |
|
|
|
SET_ACC_ZERO8 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
vec_t *rowA1 = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB1 = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]); |
|
|
|
__builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]); |
|
|
|
AO += 16; |
|
|
|
BO += 8; |
|
|
|
temp--; |
|
|
|
BLASLONG K = temp / 64; |
|
|
|
for (l = 0; l < K; l++) |
|
|
|
{ |
|
|
|
@@ -454,12 +451,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1, acc2, acc3; |
|
|
|
SET_ACC_ZERO4 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 3]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 3]; |
|
|
|
rowA = (vec_t *) & AO[l << 3]; |
|
|
|
rowB = (vec_t *) & BO[l << 3]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); |
|
|
|
@@ -489,13 +491,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
__builtin_mma_xxsetaccz (&acc1); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 2]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 3]; |
|
|
|
rowA = (vec_t *) & AO[l << 2]; |
|
|
|
rowB = (vec_t *) & BO[l << 3]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); |
|
|
|
} |
|
|
|
@@ -522,15 +526,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v2sf_t *rowC; |
|
|
|
v2sf_t result[8]; |
|
|
|
__vector_quad acc0, acc1; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
__builtin_mma_xxsetaccz (&acc1); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = AO[0], t[1] = AO[1]; |
|
|
|
vec_t *rowA = (vec_t *) & t[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; |
|
|
|
vec_t *rowA = (vec_t *) & t[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 3]; |
|
|
|
rowA = (vec_t *) & t[0]; |
|
|
|
rowB = (vec_t *) & BO[l << 3]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); |
|
|
|
} |
|
|
|
@@ -625,13 +632,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
FLOAT *A1; |
|
|
|
A1 = AO + (16 * k); |
|
|
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; |
|
|
|
SET_ACC_ZERO8 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < k; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowA1 = (vec_t *) & A1[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); |
|
|
|
__builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); |
|
|
|
for (l = 1; l < k; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 4]; |
|
|
|
vec_t *rowA1 = (vec_t *) & A1[l << 4]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 2]; |
|
|
|
rowA = (vec_t *) & AO[l << 4]; |
|
|
|
rowA1 = (vec_t *) & A1[l << 4]; |
|
|
|
rowB = (vec_t *) & BO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); |
|
|
|
@@ -673,12 +690,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1, acc2, acc3; |
|
|
|
SET_ACC_ZERO4 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 4]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 2]; |
|
|
|
rowA = (vec_t *) & AO[l << 4]; |
|
|
|
rowB = (vec_t *) & BO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); |
|
|
|
@@ -710,13 +732,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
__builtin_mma_xxsetaccz (&acc1); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 3]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 2]; |
|
|
|
rowA = (vec_t *) & AO[l << 3]; |
|
|
|
rowB = (vec_t *) & BO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
} |
|
|
|
@@ -742,12 +766,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
__vector_quad acc0; |
|
|
|
v4sf_t result[4]; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 2]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 2]; |
|
|
|
rowA = (vec_t *) & AO[l << 2]; |
|
|
|
rowB = (vec_t *) & BO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
} |
|
|
|
SAVE_ACC (&acc0, 0); |
|
|
|
@@ -771,14 +797,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v2sf_t *rowC; |
|
|
|
v2sf_t result[8]; |
|
|
|
__vector_quad acc0; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = AO[0], t[1] = AO[1]; |
|
|
|
vec_t *rowA = (vec_t *) & t[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; |
|
|
|
vec_t *rowA = (vec_t *) & t[0]; |
|
|
|
vec_t *rowB = (vec_t *) & BO[l << 2]; |
|
|
|
rowA = (vec_t *) & t[0]; |
|
|
|
rowB = (vec_t *) & BO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
} |
|
|
|
SAVE4x2_ACC (&acc0, 0); |
|
|
|
@@ -856,15 +885,26 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
FLOAT *A1; |
|
|
|
A1 = AO + (16 * k); |
|
|
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; |
|
|
|
SET_ACC_ZERO8 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < k; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[0], t[1] = BO[1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
vec_t *rowA1 = (vec_t *) & A1[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); |
|
|
|
__builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); |
|
|
|
for (l = 1; l < k; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 4]; |
|
|
|
vec_t *rowA1 = (vec_t *) & A1[l << 4]; |
|
|
|
rowB = (vec_t *) & t[0]; |
|
|
|
rowA = (vec_t *) & AO[l << 4]; |
|
|
|
rowA1 = (vec_t *) & A1[l << 4]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); |
|
|
|
@@ -897,7 +937,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1, acc2, acc3; |
|
|
|
SET_ACC_ZERO4 (); |
|
|
|
BLASLONG l = 0; |
|
|
|
#if defined(TRMMKERNEL) |
|
|
|
REFRESH_POINTERS (16, 2) |
|
|
|
@@ -905,12 +944,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
BO = B; |
|
|
|
temp = k; |
|
|
|
#endif |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[0], t[1] = BO[1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); |
|
|
|
__builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 4]; |
|
|
|
rowB = (vec_t *) & t[0]; |
|
|
|
rowA = (vec_t *) & AO[l << 4]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); |
|
|
|
@@ -934,8 +980,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0, acc1; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
__builtin_mma_xxsetaccz (&acc1); |
|
|
|
#if defined(TRMMKERNEL) |
|
|
|
REFRESH_POINTERS (8, 2) |
|
|
|
#else |
|
|
|
@@ -943,12 +987,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
temp = k; |
|
|
|
#endif |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[0], t[1] = BO[1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 3]; |
|
|
|
rowB = (vec_t *) & t[0]; |
|
|
|
rowA = (vec_t *) & AO[l << 3]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); |
|
|
|
} |
|
|
|
@@ -968,7 +1017,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
v4sf_t *rowC; |
|
|
|
v4sf_t result[4]; |
|
|
|
__vector_quad acc0; |
|
|
|
__builtin_mma_xxsetaccz (&acc0); |
|
|
|
#if defined(TRMMKERNEL) |
|
|
|
REFRESH_POINTERS (4, 2) |
|
|
|
#else |
|
|
|
@@ -976,12 +1024,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, |
|
|
|
temp = k; |
|
|
|
#endif |
|
|
|
BLASLONG l = 0; |
|
|
|
for (l = 0; l < temp; l++) |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[0], t[1] = BO[1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[0]; |
|
|
|
__builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); |
|
|
|
for (l = 1; l < temp; l++) |
|
|
|
{ |
|
|
|
FLOAT t[4] = { 0 }; |
|
|
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; |
|
|
|
vec_t *rowB = (vec_t *) & t[0]; |
|
|
|
vec_t *rowA = (vec_t *) & AO[l << 2]; |
|
|
|
rowB = (vec_t *) & t[0]; |
|
|
|
rowA = (vec_t *) & AO[l << 2]; |
|
|
|
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); |
|
|
|
} |
|
|
|
SAVE2x4_ACC (&acc0, 0); |
|
|
|
|