| @@ -237,7 +237,7 @@ int main(int argc, char *argv[]){ | |||
| timeg = time1/loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1); | |||
| COMPSIZE * COMPSIZE * (2.*(double)k+2.) * (double)m * (double)n / timeg * 1.e-6, time1); | |||
| } | |||
| @@ -801,32 +801,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i = (m & 1); | |||
| if (i > 0) { | |||
| if (X > posY) { | |||
| /* a01 += 2; | |||
| a02 += 2; */ | |||
| b += 4; | |||
| } else | |||
| if (X < posY) { | |||
| b[ 0] = *(a01 + 0); | |||
| b[ 1] = *(a01 + 1); | |||
| b[ 2] = *(a01 + 2); | |||
| b[ 3] = *(a01 + 3); | |||
| /* a01 += lda; | |||
| a02 += lda; */ | |||
| b += 4; | |||
| } else { | |||
| #ifdef UNIT | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| #else | |||
| b[ 0] = *(a01 + 0); | |||
| b[ 1] = *(a01 + 1); | |||
| if (X < posY) { | |||
| #endif | |||
| b[ 2] = *(a01 + 2); | |||
| b[ 3] = *(a01 + 3); | |||
| b += 4; | |||
| } | |||
| b[ 0] = *(a01 + 0); | |||
| b[ 1] = *(a01 + 1); | |||
| #ifdef UNIT | |||
| } else { | |||
| b[ 0] = ONE; | |||
| b[ 1] = ZERO; | |||
| } | |||
| #endif | |||
| b[ 2] = *(a01 + 2); | |||
| b[ 3] = *(a01 + 3); | |||
| b += 4; | |||
| } | |||
| posY += 2; | |||
| } | |||
| @@ -301,7 +301,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| dtrmm_kernel_4x8( temp, &alpha , ptrba, ptrbb, C0, C1, C2, C3, C4, C5, C6, C7); | |||
| ptrba = ptrba + temp * 4; | |||
| ptrbb = ptrbb + temp * 8; | |||
| // ptrbb = ptrbb + temp * 8; | |||
| /* | |||
| for (k=0; k<temp; k++) | |||
| @@ -446,7 +446,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| temp - 8; // number of values in B | |||
| ptrba += temp*4; // number of values in A | |||
| ptrbb += temp*8; // number of values in B | |||
| // ptrbb += temp*8; // number of values in B | |||
| } | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| @@ -709,14 +709,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| /* C0 = C0+1; | |||
| C1 = C1+1; | |||
| C2 = C2+1; | |||
| C3 = C3+1; | |||
| C4 = C4+1; | |||
| C5 = C5+1; | |||
| C6 = C6+1; | |||
| C7 = C7+1; | |||
| C7 = C7+1; */ | |||
| } | |||
| @@ -862,7 +862,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| temp - 4; // number of values in B */ | |||
| ptrba += temp*4; // number of values in A | |||
| ptrbb += temp*4; // number of values in B | |||
| // ptrbb += temp*4; // number of values in B | |||
| } | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| @@ -1049,10 +1049,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| /* C0 = C0+1; | |||
| C1 = C1+1; | |||
| C2 = C2+1; | |||
| C3 = C3+1; | |||
| C3 = C3+1; */ | |||
| } | |||
| @@ -1311,8 +1311,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| /* C0 = C0+1; | |||
| C1 = C1+1; */ | |||
| } | |||
| @@ -1532,7 +1532,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| // C0 = C0+1; | |||
| } | |||
| @@ -124,13 +124,13 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra | |||
| min_jj = js + min_j - jjs; | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| if (0 && GEMM_UNROLL_N <= 8) { | |||
| /* if (0 && GEMM_UNROLL_N <= 8) { | |||
| LASWP_NCOPY(min_jj, off + 1, off + k, | |||
| c + (- off + jjs * lda) * COMPSIZE, lda, | |||
| ipiv, sbb + k * (jjs - js) * COMPSIZE); | |||
| } else { | |||
| } else { */ | |||
| LASWP_PLUS(min_jj, off + 1, off + k, ZERO, | |||
| #ifdef COMPLEX | |||
| @@ -140,7 +140,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra | |||
| GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE); | |||
| } | |||
| // } | |||
| for (is = 0; is < k; is += GEMM_P) { | |||
| min_i = k - is; | |||
| @@ -251,14 +251,14 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| min_jj = MIN(n_to, xxx + div_n) - jjs; | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| if (0 && GEMM_UNROLL_N <= 8) { | |||
| /* if (0 && GEMM_UNROLL_N <= 8) { | |||
| printf("helllo\n"); | |||
| LASWP_NCOPY(min_jj, off + 1, off + k, | |||
| b + (- off + jjs * lda) * COMPSIZE, lda, | |||
| ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); | |||
| } else { | |||
| } else { */ | |||
| LASWP_PLUS(min_jj, off + 1, off + k, ZERO, | |||
| #ifdef COMPLEX | |||
| @@ -268,7 +268,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
| GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, | |||
| buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); | |||
| } | |||
| // } | |||
| for (is = 0; is < k; is += GEMM_P) { | |||
| min_i = k - is; | |||
| @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| BLASLONG n, info; | |||
| BLASLONG bk, i, blocking, start_i; | |||
| int mode; | |||
| BLASLONG lda, range_N[2]; | |||
| BLASLONG lda;//, range_N[2]; | |||
| blas_arg_t newarg; | |||
| FLOAT *a; | |||
| FLOAT alpha[2] = { ONE, ZERO}; | |||
| @@ -100,8 +100,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| bk = n - i; | |||
| if (bk > blocking) bk = blocking; | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| /* range_N[0] = i; | |||
| range_N[1] = i + bk; */ | |||
| newarg.lda = lda; | |||
| newarg.ldb = lda; | |||
| @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| BLASLONG n, info; | |||
| BLASLONG bk, i, blocking; | |||
| int mode; | |||
| BLASLONG lda, range_N[2]; | |||
| BLASLONG lda; // , range_N[2]; | |||
| blas_arg_t newarg; | |||
| FLOAT *a; | |||
| FLOAT alpha[2] = { ONE, ZERO}; | |||
| @@ -96,8 +96,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| bk = n - i; | |||
| if (bk > blocking) bk = blocking; | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| /* range_N[0] = i; | |||
| range_N[1] = i + bk; */ | |||
| newarg.lda = lda; | |||
| newarg.ldb = lda; | |||