Sapphire Rapids sbgemm fixtags/v0.3.24
| @@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN | |||||
| endif | endif | ||||
| ifneq ($(NO_AVX512), 1) | ifneq ($(NO_AVX512), 1) | ||||
| ifneq ($(NO_AVX2), 1) | ifneq ($(NO_AVX2), 1) | ||||
| DYNAMIC_CORE += SKYLAKEX COOPERLAKE | |||||
| DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -82,7 +82,7 @@ if (DYNAMIC_ARCH) | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | ||||
| endif () | endif () | ||||
| if (NOT NO_AVX512) | if (NOT NO_AVX512) | ||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) | |||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS) | |||||
| string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") | string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") | ||||
| endif () | endif () | ||||
| if (DYNAMIC_LIST) | if (DYNAMIC_LIST) | ||||
| @@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| #message(STATUS "skipping ${makefile_line}") | #message(STATUS "skipping ${makefile_line}") | ||||
| continue () | continue () | ||||
| endif () | endif () | ||||
| # Example 1: SBGEMM_SMALL_M_PERMIT = | |||||
| # Unset the variable | |||||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}") | |||||
| if (NOT "${line_match}" STREQUAL "") | |||||
| set(var_name ${CMAKE_MATCH_1}) | |||||
| unset(${var_name}) | |||||
| endif() | |||||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | ||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| #message(STATUS "match on ${line_match}") | #message(STATUS "match on ${line_match}") | ||||
| @@ -53,6 +53,7 @@ typedef struct { | |||||
| int sbgemm_p, sbgemm_q, sbgemm_r; | int sbgemm_p, sbgemm_q, sbgemm_r; | ||||
| int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | ||||
| int sbgemm_align_k; | int sbgemm_align_k; | ||||
| int need_amxtile_permission; // 0 default, 1 for device support amx. | |||||
| void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | ||||
| void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | ||||
| @@ -1479,6 +1479,8 @@ int get_cpuname(void){ | |||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 15: // Sapphire Rapids | case 15: // Sapphire Rapids | ||||
| if(support_amx_bf16()) | |||||
| return CPUTYPE_SAPPHIRERAPIDS; | |||||
| if(support_avx512_bf16()) | if(support_avx512_bf16()) | ||||
| return CPUTYPE_COOPERLAKE; | return CPUTYPE_COOPERLAKE; | ||||
| if(support_avx512()) | if(support_avx512()) | ||||
| @@ -1845,7 +1847,8 @@ static char *cpuname[] = { | |||||
| "ZEN", | "ZEN", | ||||
| "SKYLAKEX", | "SKYLAKEX", | ||||
| "DHYANA", | "DHYANA", | ||||
| "COOPERLAKE" | |||||
| "COOPERLAKE", | |||||
| "SAPPHIRERAPIDS", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1902,7 +1905,8 @@ static char *lowercpuname[] = { | |||||
| "zen", | "zen", | ||||
| "skylakex", | "skylakex", | ||||
| "dhyana", | "dhyana", | ||||
| "cooperlake" | |||||
| "cooperlake", | |||||
| "sapphirerapids", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||||
| #else | #else | ||||
| #define gotoblas_COOPERLAKE gotoblas_PRESCOTT | #define gotoblas_COOPERLAKE gotoblas_PRESCOTT | ||||
| #endif | #endif | ||||
| #ifdef DYN_SAPPHIRERAPIDS | |||||
| extern gotoblas_t gotoblas_SAPPHIRERAPIDS; | |||||
| #elif defined(DYN_SKYLAKEX) | |||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX | |||||
| #elif defined(DYN_HASWELL) | |||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL | |||||
| #elif defined(DYN_SANDYBRIDGE) | |||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE | |||||
| #elif defined(DYN_NEHALEM) | |||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM | |||||
| #else | |||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT | |||||
| #endif | |||||
| #else // not DYNAMIC_LIST | #else // not DYNAMIC_LIST | ||||
| @@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN; | |||||
| #ifndef NO_AVX512 | #ifndef NO_AVX512 | ||||
| extern gotoblas_t gotoblas_SKYLAKEX; | extern gotoblas_t gotoblas_SKYLAKEX; | ||||
| extern gotoblas_t gotoblas_COOPERLAKE; | extern gotoblas_t gotoblas_COOPERLAKE; | ||||
| extern gotoblas_t gotoblas_SAPPHIRERAPIDS; | |||||
| #else | #else | ||||
| #define gotoblas_SKYLAKEX gotoblas_HASWELL | #define gotoblas_SKYLAKEX gotoblas_HASWELL | ||||
| #define gotoblas_COOPERLAKE gotoblas_HASWELL | #define gotoblas_COOPERLAKE gotoblas_HASWELL | ||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #else | #else | ||||
| @@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||||
| #define gotoblas_HASWELL gotoblas_NEHALEM | #define gotoblas_HASWELL gotoblas_NEHALEM | ||||
| #define gotoblas_SKYLAKEX gotoblas_NEHALEM | #define gotoblas_SKYLAKEX gotoblas_NEHALEM | ||||
| #define gotoblas_COOPERLAKE gotoblas_NEHALEM | #define gotoblas_COOPERLAKE gotoblas_NEHALEM | ||||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM | |||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | #define gotoblas_BULLDOZER gotoblas_BARCELONA | ||||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | #define gotoblas_PILEDRIVER gotoblas_BARCELONA | ||||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | #define gotoblas_STEAMROLLER gotoblas_BARCELONA | ||||
| @@ -378,6 +394,31 @@ int support_avx512_bf16(){ | |||||
| #endif | #endif | ||||
| } | } | ||||
| #define BIT_AMX_TILE 0x01000000 | |||||
| #define BIT_AMX_BF16 0x00400000 | |||||
| #define BIT_AMX_ENBD 0x00060000 | |||||
| int support_amx_bf16() { | |||||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||||
| int eax, ebx, ecx, edx; | |||||
| int ret=0; | |||||
| if (!support_avx512()) | |||||
| return 0; | |||||
| // CPUID.7.0:EDX indicates AMX support | |||||
| cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); | |||||
| if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { | |||||
| // CPUID.D.0:EAX[17:18] indicates AMX enabled | |||||
| cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | |||||
| if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) | |||||
| ret = 1; | |||||
| } | |||||
| return ret; | |||||
| #else | |||||
| return 0; | |||||
| #endif | |||||
| } | |||||
| extern void openblas_warning(int verbose, const char * msg); | extern void openblas_warning(int verbose, const char * msg); | ||||
| #define FALLBACK_VERBOSE 1 | #define FALLBACK_VERBOSE 1 | ||||
| #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | ||||
| @@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| if (model == 15){ // Sapphire Rapids | if (model == 15){ // Sapphire Rapids | ||||
| if(support_amx_bf16()) | |||||
| return &gotoblas_SAPPHIRERAPIDS; | |||||
| if(support_avx512_bf16()) | if(support_avx512_bf16()) | ||||
| return &gotoblas_COOPERLAKE; | return &gotoblas_COOPERLAKE; | ||||
| if (support_avx512()) | if (support_avx512()) | ||||
| @@ -941,7 +984,8 @@ static char *corename[] = { | |||||
| "Excavator", | "Excavator", | ||||
| "Zen", | "Zen", | ||||
| "SkylakeX", | "SkylakeX", | ||||
| "Cooperlake" | |||||
| "Cooperlake", | |||||
| "SapphireRapids" | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_ZEN) return corename[23]; | if (gotoblas == &gotoblas_ZEN) return corename[23]; | ||||
| if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; | if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; | ||||
| if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; | if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; | ||||
| if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||||
| #define XFEATURE_XTILEDATA 18 | |||||
| #define ARCH_REQ_XCOMP_PERM 0x1023 | |||||
| static int openblas_amxtile_permission = 0; | |||||
| static int init_amxtile_permission() { | |||||
| long status = | |||||
| syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); | |||||
| if (status != 0) { | |||||
| fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, " | |||||
| "Intel Sapphier Rapids), skip sbgemm calculation\n"); | |||||
| return -1; | |||||
| } | |||||
| openblas_amxtile_permission = 1; | |||||
| return 0; | |||||
| } | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANSA, char *TRANSB, | void NAME(char *TRANSA, char *TRANSB, | ||||
| @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #endif | #endif | ||||
| #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||||
| #if defined(DYNAMIC_ARCH) | |||||
| if (gotoblas->need_amxtile_permission && | |||||
| openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) | |||||
| if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) | |||||
| if ((args.m == 0) || (args.n == 0)) return; | if ((args.m == 0) || (args.n == 0)) return; | ||||
| #if 0 | #if 0 | ||||
| @@ -33,7 +33,7 @@ endif | |||||
| ifdef TARGET_CORE | ifdef TARGET_CORE | ||||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | ||||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | ||||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||||
| override CFLAGS += -march=sapphirerapids | override CFLAGS += -march=sapphirerapids | ||||
| else | else | ||||
| override CFLAGS += -march=skylake-avx512 -mavx512f | override CFLAGS += -march=skylake-avx512 -mavx512f | ||||
| @@ -66,6 +66,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #endif | #endif | ||||
| SBGEMM_ALIGN_K, | SBGEMM_ALIGN_K, | ||||
| 0, // need_amxtile_permission | |||||
| sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | ||||
| @@ -1809,6 +1810,12 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #ifdef SAPPHIRERAPIDS | |||||
| #if (BUILD_BFLOAT16 == 1) | |||||
| TABLE_NAME.need_amxtile_permission = 1; | |||||
| #endif | |||||
| #endif | |||||
| #if BUILD_COMPLEX==1 | #if BUILD_COMPLEX==1 | ||||
| #ifdef CGEMM3M_DEFAULT_P | #ifdef CGEMM3M_DEFAULT_P | ||||
| TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; | ||||
| @@ -1,6 +1,14 @@ | |||||
| include $(KERNELDIR)/KERNEL.COOPERLAKE | include $(KERNELDIR)/KERNEL.COOPERLAKE | ||||
| SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c | |||||
| SBGEMM_SMALL_M_PERMIT = | |||||
| SBGEMM_SMALL_K_NN = | |||||
| SBGEMM_SMALL_K_B0_NN = | |||||
| SBGEMM_SMALL_K_NT = | |||||
| SBGEMM_SMALL_K_B0_NT = | |||||
| SBGEMM_SMALL_K_TN = | |||||
| SBGEMM_SMALL_K_B0_TN = | |||||
| SBGEMM_SMALL_K_TT = | |||||
| SBGEMM_SMALL_K_B0_TT = | |||||
| SBGEMM_BETA = sgemm_beta_skylakex.c | SBGEMM_BETA = sgemm_beta_skylakex.c | ||||
| SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c | SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c | ||||
| @@ -97,33 +97,32 @@ typedef struct { | |||||
| #define T_C10 6 | #define T_C10 6 | ||||
| #define T_C11 7 | #define T_C11 7 | ||||
| // FIXME: gcc11 seem have problem in tile load/store address calc, | |||||
| // need to multiply with element size (2 or 4) here. | |||||
| #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) | #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) | ||||
| #define LOAD_A_TAIL(M, N) {\ | #define LOAD_A_TAIL(M, N) {\ | ||||
| __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ | __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ | ||||
| __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | ||||
| _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ | _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ | ||||
| _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ | |||||
| _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ | |||||
| } | } | ||||
| #define MASK_LOAD_A_TAIL(M, N) {\ | #define MASK_LOAD_A_TAIL(M, N) {\ | ||||
| __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ | __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ | ||||
| __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | ||||
| _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ | _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ | ||||
| _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ | |||||
| _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ | |||||
| } | } | ||||
| #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) | #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) | ||||
| #define LOAD_B_TAIL(M, N) {\ | #define LOAD_B_TAIL(M, N) {\ | ||||
| __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ | __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ | ||||
| __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | ||||
| _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ | _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ | ||||
| _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ | |||||
| _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ | |||||
| } | } | ||||
| #define MASK_LOAD_B_TAIL(M, N) {\ | #define MASK_LOAD_B_TAIL(M, N) {\ | ||||
| __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ | __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ | ||||
| __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ | ||||
| _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ | _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ | ||||
| _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ | |||||
| _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ | |||||
| } | } | ||||
| #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) | #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) | ||||