Browse Source

dgemm/avx512 simplify and speed up the 4x4 kernel

tags/v0.3.4
Arjan van de Ven 7 years ago
parent
commit
20c5d668fe
1 changed files with 4 additions and 22 deletions
  1. +4
    -22
      kernel/x86_64/dgemm_kernel_4x8_skylakex.c

+ 4
- 22
kernel/x86_64/dgemm_kernel_4x8_skylakex.c View File

@@ -333,17 +333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/*
 * KERNEL4x4_SUB: one inner-loop step of the 4x4 double-precision kernel.
 *
 * Expands to a sequence of statements (not wrapped in do { } while (0)), so
 * it must be used where a plain statement list is valid.  It takes no
 * arguments and relies on these names being in scope at the expansion site:
 *   AO, BO       - pointers to double; each is advanced by 4 elements at the
 *                  end of the macro.
 *   ymm0, ymm1   - 256-bit scratch vectors, overwritten here.
 *   ymm4..ymm7   - 256-bit accumulators, updated as ymmN += ymm0 * ymm1.
 *
 * As written:
 *   - ymm0 is loaded (unaligned) with the four doubles at AO - 16.
 *   - ymm1 is first loaded with four doubles at BO - 12 and then immediately
 *     overwritten by a broadcast of the single double at BO - 12; the first
 *     value is never read.
 *   - Before each of the ymm5/ymm6/ymm7 updates, ymm0 is shuffled with
 *     _mm256_permute4x64_pd (0xb1 swaps the two elements inside each
 *     adjacent pair, 0x1b reverses all four elements) and ymm1 is reloaded
 *     as a broadcast of BO - 11, BO - 10 and BO - 9 respectively.
 *
 * NOTE(review): this text looks like a rendered diff whose +/- markers were
 * dropped, so lines from the pre-change version (vector load of B, permutes
 * of ymm0) and the post-change version (scalar broadcasts of B) may be
 * interleaved here.  Confirm against the real file before relying on the
 * lane layout left in ymm5..ymm7.
 */
#define KERNEL4x4_SUB() \
ymm0 = _mm256_loadu_pd(AO - 16); \
ymm1 = _mm256_loadu_pd(BO - 12); \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 12)); \
\
ymm4 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 11)); \
ymm5 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 10)); \
ymm6 += ymm0 * ymm1; \
\
ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
ymm1 = _mm256_broadcastsd_pd(_mm_load_sd(BO - 9)); \
ymm7 += ymm0 * ymm1; \
AO += 4; \
BO += 4;
@@ -356,24 +356,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ymm6 *= ymm0; \
ymm7 *= ymm0; \
\
ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \
ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \
\
ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \
ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
\
ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
\
ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
\
ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \


Loading…
Cancel
Save