| @@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = 0; jjs < ls - js; jjs += min_jj){ | for(jjs = 0; jjs < ls - js; jjs += min_jj){ | ||||
| min_jj = ls - js - jjs; | min_jj = ls - js - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); | GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); | ||||
| #else | #else | ||||
| @@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | for(jjs = 0; jjs < min_l; jjs += min_jj){ | ||||
| min_jj = min_l - jjs; | min_jj = min_l - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); | TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); | ||||
| #else | #else | ||||
| @@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||
| min_jj = min_j + js - jjs; | min_jj = min_j + js - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | ||||
| #else | #else | ||||
| @@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | for(jjs = 0; jjs < min_l; jjs += min_jj){ | ||||
| min_jj = min_l - jjs; | min_jj = min_l - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); | TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); | ||||
| #else | #else | ||||
| @@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | ||||
| min_jj = js - ls - min_l - jjs; | min_jj = js - ls - min_l - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, | GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, | ||||
| sb + min_l * (min_l + jjs) * COMPSIZE); | sb + min_l * (min_l + jjs) * COMPSIZE); | ||||
| @@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | for(jjs = js; jjs < js + min_j; jjs += min_jj){ | ||||
| min_jj = min_j + js - jjs; | min_jj = min_j + js - jjs; | ||||
| #ifdef SKYLAKEX | |||||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||||
| #else | |||||
| if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; | ||||
| else | else | ||||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | ||||
| #endif | |||||
| #ifndef TRANSA | #ifndef TRANSA | ||||
| GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); | ||||
| #else | #else | ||||