Browse Source

Update trmm_L.c

tags/v0.3.8^2
wjc404 GitHub 6 years ago
parent
commit
833bd0f8ff
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 20 additions and 4 deletions
  1. +20
    -4
      driver/level3/trmm_L.c

+ 20
- 4
driver/level3/trmm_L.c View File

@@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#endif
START_RPCC();

GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#endif
START_RPCC();

GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#endif
START_RPCC();

GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
@@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#endif
START_RPCC();

GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,


Loading…
Cancel
Save