Browse Source

changed level3.c

tags/v0.2.9.rc1
wernsaar 12 years ago
parent
commit
be18cd47f6
2 changed files with 3 additions and 93 deletions
  1. +2 -24 driver/level3/level3.c
  2. +1 -69 driver/level3/level3_thread.c

+2 -24 driver/level3/level3.c View File

@@ -36,8 +36,6 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/


// #define TIMING 1

/* This file is a template for level 3 operation */ /* This file is a template for level 3 operation */


#ifndef BETA_OPERATION #ifndef BETA_OPERATION
@@ -335,24 +333,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;


#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#elif defined(ARMV7)
if (min_jj >= 32) min_jj = 32;
else
if (min_jj >= 16) min_jj = 16;
else
if (min_jj >= 8) min_jj = 8;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#else #else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif


@@ -412,22 +400,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#ifdef TIMING #ifdef TIMING
total = (double)outercost + (double)innercost + (double)kernelcost; total = (double)outercost + (double)innercost + (double)kernelcost;


#ifdef ARMV7

printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n",
innercost / total * 100., outercost / total * 100.,
kernelcost / total * 100.);


#else

printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n",
innercost / total * 100., outercost / total * 100., innercost / total * 100., outercost / total * 100.,
kernelcost / total * 100., kernelcost / total * 100.,
(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2.,
(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.);


#endif
#endif #endif


return 0; return 0;


+1 -69 driver/level3/level3_thread.c View File

@@ -36,8 +36,6 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/


// #define TIMING 1

#ifndef CACHE_LINE_SIZE #ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8 #define CACHE_LINE_SIZE 8
#endif #endif
@@ -235,21 +233,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
BLASLONG l1stride, l2size; BLASLONG l1stride, l2size;


#ifdef TIMING #ifdef TIMING

#ifdef ARMV7

unsigned long long rpcc_counter;
unsigned long long copy_A = 0;
unsigned long long copy_B = 0;
unsigned long long kernel = 0;
unsigned long long waiting1 = 0;
unsigned long long waiting2 = 0;
unsigned long long waiting3 = 0;
unsigned long long waiting6[MAX_CPU_NUMBER];
unsigned long long ops = 0;

#else

BLASULONG rpcc_counter; BLASULONG rpcc_counter;
BLASULONG copy_A = 0; BLASULONG copy_A = 0;
BLASULONG copy_B = 0; BLASULONG copy_B = 0;
@@ -260,8 +243,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG waiting6[MAX_CPU_NUMBER];
BLASULONG ops = 0; BLASULONG ops = 0;


#endif

for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif #endif


@@ -339,35 +320,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,


min_l = k - ls; min_l = k - ls;


#ifdef ARMV7_1
if (min_l >= GEMM_Q / 4 * 2) {
min_l = GEMM_Q / 4;
} else {
if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2;
}

#else
if (min_l >= GEMM_Q * 2) { if (min_l >= GEMM_Q * 2) {
min_l = GEMM_Q; min_l = GEMM_Q;
} else { } else {
if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
} }
#endif


l1stride = 1; l1stride = 1;
min_i = m_to - m_from; min_i = m_to - m_from;
#ifdef ARMV7_1
if (min_i >= GEMM_P / 4 * 2) {
min_i = GEMM_P / 4;
} else {
if (min_i > GEMM_P / 4) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
} else {
if (args -> nthreads == 1) l1stride = 0;
}
}
#else
if (min_i >= GEMM_P * 2) { if (min_i >= GEMM_P * 2) {
min_i = GEMM_P; min_i = GEMM_P;
} else { } else {
@@ -378,8 +339,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} }
} }


#endif

START_RPCC(); START_RPCC();
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
@@ -408,22 +367,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;


#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#elif defined(ARMV7)
if (min_jj >= 16) min_jj = 16;
else
if (min_jj >= 8) min_jj = 8;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;


#else #else


if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@@ -555,21 +504,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
STOP_RPCC(waiting3); STOP_RPCC(waiting3);


#ifdef TIMING #ifdef TIMING

#ifdef ARMV7

unsigned long long waiting = waiting1 + waiting2 + waiting3;
unsigned long long total = copy_A + copy_B + kernel + waiting;

fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
(double)waiting1 /(double)total * 100.,
(double)waiting2 /(double)total * 100.,
(double)waiting3 /(double)total * 100.,
(double)kernel /(double)total * 100.);

#else

BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG waiting = waiting1 + waiting2 + waiting3;
BLASLONG total = copy_A + copy_B + kernel + waiting; BLASLONG total = copy_A + copy_B + kernel + waiting;


@@ -580,8 +514,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
(double)waiting3 /(double)total * 100., (double)waiting3 /(double)total * 100.,
(double)ops/(double)kernel / 4. * 100.); (double)ops/(double)kernel / 4. * 100.);


#endif

#if 0 #if 0
fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n",
mypos, copy_A, copy_B, waiting); mypos, copy_A, copy_B, waiting);


Loading…
Cancel
Save