| @@ -183,4 +183,7 @@ In chronological order: | |||||
| * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | ||||
| * [2020-04-15] Half-precision GEMM for bfloat16 | * [2020-04-15] Half-precision GEMM for bfloat16 | ||||
| * Marius Hillenbrand <https://github.com/mhillenibm> | |||||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | |||||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | |||||
| @@ -563,8 +563,27 @@ DYNAMIC_CORE += EMAG8180 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| DYNAMIC_CORE = Z13 | |||||
| DYNAMIC_CORE = ZARCH_GENERIC | |||||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||||
| GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2")) | |||||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||||
| RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3")) | |||||
| endif | |||||
| ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1) | |||||
| DYNAMIC_CORE += Z13 | |||||
| else | |||||
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2)
| endif | |||||
| GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||||
| ifeq ($(GCC_MAJOR_GE_7), 1) | |||||
| DYNAMIC_CORE += Z14 | DYNAMIC_CORE += Z14 | ||||
| else | |||||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| @@ -855,7 +874,7 @@ ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| FCOMMON_OPT += -recursive | |||||
| FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| FCOMMON_OPT += -fopenmp | FCOMMON_OPT += -fopenmp | ||||
| endif | endif | ||||
| @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector | |||||
| endif | endif | ||||
| ifeq ($(CORE), Z14) | ifeq ($(CORE), Z14) | ||||
| CCOMMON_OPT += -march=z14 -mzvector | |||||
| CCOMMON_OPT += -march=z14 -mzvector -O3 | |||||
| FCOMMON_OPT += -march=z14 -mzvector | FCOMMON_OPT += -march=z14 -mzvector | ||||
| endif | endif | ||||
| @@ -49,6 +49,12 @@ else | |||||
| GOTO_LAPACK_TARGETS= | GOTO_LAPACK_TARGETS= | ||||
| endif | endif | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| GOTO_HALF_TARGETS=shgemm.goto | |||||
| else | |||||
| GOTO_HALF_TARGETS= | |||||
| endif | |||||
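# shgemm.goto is only built when the library itself was configured with
# BUILD_HALF=1, i.e. with bfloat16 (shgemm) support compiled in.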
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | ||||
| @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | ||||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | ||||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto | |||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||||
| samin.goto damin.goto camin.goto zamin.goto \ | samin.goto damin.goto camin.goto zamin.goto \ | ||||
| smin.goto dmin.goto \ | smin.goto dmin.goto \ | ||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | ||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) | |||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ##################################### Sgemm #################################################### | ##################################### Sgemm #################################################### | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| endif | |||||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
| @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c | |||||
| zcholesky.$(SUFFIX) : cholesky.c | zcholesky.$(SUFFIX) : cholesky.c | ||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| shgemm.$(SUFFIX) : gemm.c | |||||
| $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| endif | |||||
| sgemm.$(SUFFIX) : gemm.c | sgemm.$(SUFFIX) : gemm.c | ||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | ||||
| @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| #define GEMM BLASFUNC(dgemm) | #define GEMM BLASFUNC(dgemm) | ||||
| #elif defined(HALF) | |||||
| #define GEMM BLASFUNC(shgemm) | |||||
| #else | #else | ||||
| #define GEMM BLASFUNC(sgemm) | #define GEMM BLASFUNC(sgemm) | ||||
| #endif | #endif | ||||
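A hedged aside on the type switch in the next hunk: in OpenBLAS, the input
element type IFLOAT differs from the output type FLOAT only in BUILD_HALF
(bfloat16) builds, roughly along these lines (a sketch of the convention, not
the verbatim common.h definitions):

    #ifdef HALF
    typedef unsigned short bfloat16;   /* 16-bit storage for bfloat16 values */
    #define IFLOAT bfloat16            /* inputs a and b are bfloat16        */
    #else
    #define IFLOAT FLOAT               /* otherwise inputs match the output  */
    #endif

FLOAT itself stays float (or double) for the output c, which is why a and b
below switch to IFLOAT while c does not.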
| @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ | |||||
| int main(int argc, char *argv[]){ | int main(int argc, char *argv[]){ | ||||
| FLOAT *a, *b, *c; | |||||
| IFLOAT *a, *b; | |||||
| FLOAT *c; | |||||
| FLOAT alpha[] = {1.0, 0.0}; | FLOAT alpha[] = {1.0, 0.0}; | ||||
| FLOAT beta [] = {0.0, 0.0}; | FLOAT beta [] = {0.0, 0.0}; | ||||
| char transa = 'N'; | char transa = 'N'; | ||||
| @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ | |||||
| k = to; | k = to; | ||||
| } | } | ||||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { | |||||
| if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { | |||||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | fprintf(stderr,"Out of Memory!!\n");exit(1); | ||||
| } | } | ||||
| if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { | |||||
| if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { | |||||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | fprintf(stderr,"Out of Memory!!\n");exit(1); | ||||
| } | } | ||||
| if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | ||||
| @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ | |||||
| #endif | #endif | ||||
| for (i = 0; i < m * k * COMPSIZE; i++) { | for (i = 0; i < m * k * COMPSIZE; i++) { | ||||
| a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
| a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||||
| } | } | ||||
| for (i = 0; i < k * n * COMPSIZE; i++) { | for (i = 0; i < k * n * COMPSIZE; i++) { | ||||
| b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
| b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||||
| } | } | ||||
| for (i = 0; i < m * n * COMPSIZE; i++) { | for (i = 0; i < m * n * COMPSIZE; i++) { | ||||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| @@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| #message(STATUS "match on ${line_match}") | #message(STATUS "match on ${line_match}") | ||||
| set(var_name ${CMAKE_MATCH_1}) | set(var_name ${CMAKE_MATCH_1}) | ||||
| set(var_value ${CMAKE_MATCH_2}) | |||||
| # set(var_value ${CMAKE_MATCH_2}) | |||||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | # check for Makefile variables in the string, e.g. $(TSUFFIX) | ||||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | ||||
| foreach (make_var ${make_var_matches}) | foreach (make_var ${make_var_matches}) | ||||
| @@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | ||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | ||||
| if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||||
| if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||||
| # message (STATUS "condition is true") | # message (STATUS "condition is true") | ||||
| set (IfElse 1) | set (IfElse 1) | ||||
| else () | else () | ||||
| @@ -1,12 +1,58 @@ | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <stdbool.h> | |||||
| // Gate kernels for z13 and z14 on gcc version | |||||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||||
| /* RHEL 7 since 7.3: */ \ | |||||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||||
| __GNUC_RH_RELEASE__ >= 11) | |||||
| #define HAVE_Z13_SUPPORT | |||||
| #endif | |||||
| #if __GNUC__ >= 7 | |||||
| #define HAVE_Z14_SUPPORT | |||||
| #endif | |||||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||||
| #ifdef __GLIBC__ | |||||
| #include <features.h> | |||||
| #if __GLIBC_PREREQ(2, 16) | |||||
| #include <sys/auxv.h> | |||||
| #define HAVE_GETAUXVAL 1 | |||||
| static unsigned long get_hwcap(void) | |||||
| { | |||||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||||
| char *maskenv; | |||||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||||
| maskenv = getenv("LD_HWCAP_MASK"); | |||||
| if (maskenv) | |||||
| hwcap &= strtoul(maskenv, NULL, 0); | |||||
| return hwcap; | |||||
| // note that a missing auxval is interpreted as no capabilities | |||||
| // available, which is safe. | |||||
| } | |||||
| #else // __GLIBC_PREREQ(2, 16) | |||||
| #warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||||
| static unsigned long get_hwcap(void) { | |||||
| // treat missing support for getauxval() as no capabilities available, | |||||
| // which is safe. | |||||
| return 0; | |||||
| } | |||||
| #endif // __GLIBC_PREREQ(2, 16) | |||||
#endif // __GLIBC__
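As a hedged usage sketch (assuming an s390x Linux system with glibc >= 2.16),
the raw capability bits that get_hwcap() starts from can be inspected with a
few lines of C; the HWCAP_S390_* guards keep the sketch compilable elsewhere:

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void) {
            unsigned long hwcap = getauxval(AT_HWCAP);
            printf("AT_HWCAP = %#lx\n", hwcap);
    #ifdef HWCAP_S390_VX
            printf("Vector Facility (VX):           %s\n",
                   (hwcap & HWCAP_S390_VX) ? "yes" : "no");
    #endif
    #ifdef HWCAP_S390_VXE
            printf("Vector-Enhancements Facility 1: %s\n",
                   (hwcap & HWCAP_S390_VXE) ? "yes" : "no");
    #endif
            return 0;
    }

Starting an OpenBLAS-linked program with LD_HWCAP_MASK=0 should then push the
detection below onto the generic kernels, since get_hwcap() honors that mask.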
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| extern gotoblas_t gotoblas_Z13; | extern gotoblas_t gotoblas_Z13; | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| extern gotoblas_t gotoblas_Z14; | extern gotoblas_t gotoblas_Z14; | ||||
| //extern gotoblas_t gotoblas_Z15; | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| //extern gotoblas_t gotoblas_Z14; | |||||
| //#endif | |||||
| #endif | |||||
| #define NUM_CORETYPES 4 | #define NUM_CORETYPES 4 | ||||
| @@ -16,47 +62,50 @@ static char* corename[] = { | |||||
| "unknown", | "unknown", | ||||
| "Z13", | "Z13", | ||||
| "Z14", | "Z14", | ||||
| // "Z15", | |||||
| "ZARCH_GENERIC", | "ZARCH_GENERIC", | ||||
| }; | }; | ||||
| char* gotoblas_corename(void) { | char* gotoblas_corename(void) { | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | if (gotoblas == &gotoblas_Z13) return corename[1]; | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | if (gotoblas == &gotoblas_Z14) return corename[2]; | ||||
| // if (gotoblas == &gotoblas_Z15) return corename[3]; | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| // if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||||
| //#endif | |||||
| return corename[0]; // try generic? | |||||
| #endif | |||||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||||
| return corename[0]; | |||||
| } | } | ||||
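A small, hedged usage sketch: with a DYNAMIC_ARCH build of the library, the
core selected here can be checked through the public wrapper
openblas_get_corename() (declared in OpenBLAS' cblas.h):

    #include <stdio.h>

    /* declared in cblas.h; repeated so the sketch is self-contained */
    extern char *openblas_get_corename(void);

    int main(void) {
            /* expected to print Z13, Z14 or ZARCH_GENERIC on IBM z,
               matching the corename[] table above */
            printf("OpenBLAS core: %s\n", openblas_get_corename());
            return 0;
    }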
| // __builtin_cpu_is is not supported by zarch | |||||
| /** | |||||
* Detect the best-fitting set of kernels by retrieving the CPU features the OS
* supports (from the auxiliary vector value AT_HWCAP) and choosing the set of
* kernels ("coretype") that exploits most of those features and can be compiled with the
| * available gcc version. | |||||
| * Note that we cannot use vector registers on a z13 or newer unless supported | |||||
| * by the OS kernel (which needs to handle them properly during context switch). | |||||
| */ | |||||
| static gotoblas_t* get_coretype(void) { | static gotoblas_t* get_coretype(void) { | ||||
| FILE* infile; | |||||
| char buffer[512], * p; | |||||
| p = (char*)NULL; | |||||
| infile = fopen("/proc/sysinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||||
| if (!strncmp("Type", buffer, 4)) { | |||||
| p = strchr(buffer, ':') + 2; | |||||
| #if 0 | |||||
| fprintf(stderr, "%s\n", p); | |||||
| #endif | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||||
| if (strstr(p, "2964")) return &gotoblas_Z13; | |||||
| if (strstr(p, "2965")) return &gotoblas_Z13; | |||||
| if (strstr(p, "3906")) return &gotoblas_Z14; | |||||
| if (strstr(p, "3907")) return &gotoblas_Z14; | |||||
| if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 | |||||
| if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 | |||||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||||
| return &gotoblas_Z14; | |||||
| #endif | |||||
| // z13: Vector Facility (SIMD for double) | |||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| if (hwcap & HWCAP_S390_VX) | |||||
| return &gotoblas_Z13; | |||||
| #endif | |||||
| return NULL; // should be ZARCH_GENERIC | |||||
| // fallback in case of missing compiler support, systems before z13, or | |||||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||||
| // missing support in the OS kernel) | |||||
| return &gotoblas_ZARCH_GENERIC; | |||||
| } | } | ||||
| static gotoblas_t* force_coretype(char* coretype) { | static gotoblas_t* force_coretype(char* coretype) { | ||||
| @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { | |||||
| switch (found) | switch (found) | ||||
| { | { | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| case 1: return (&gotoblas_Z13); | case 1: return (&gotoblas_Z13); | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| case 2: return (&gotoblas_Z14); | case 2: return (&gotoblas_Z14); | ||||
| // case 3: return (&gotoblas_Z15); | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| // case 3: return (&gotoblas_POWER9); | |||||
| //#endif | |||||
| #endif | |||||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||||
| default: return NULL; | default: return NULL; | ||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
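/*
 * Usage note (an assumption based on OpenBLAS' other DYNAMIC_ARCH targets):
 * force_coretype() is reached when the user sets the OPENBLAS_CORETYPE
 * environment variable, e.g. OPENBLAS_CORETYPE=ZARCH_GENERIC to bypass the
 * hwcap-based detection; gotoblas_dynamic_init() below passes that value in.
 */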
| @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { | |||||
| if (gotoblas == NULL) | if (gotoblas == NULL) | ||||
| { | { | ||||
| snprintf(coremsg, 128, "Falling back to Z14 core\n"); | |||||
| snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); | |||||
| openblas_warning(1, coremsg); | openblas_warning(1, coremsg); | ||||
| gotoblas = &gotoblas_Z14; | |||||
| gotoblas = &gotoblas_ZARCH_GENERIC; | |||||
| } | } | ||||
| if (gotoblas && gotoblas->init) { | if (gotoblas && gotoblas->init) { | ||||
| @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c | |||||
| CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c | CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c | ||||
| ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c | ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c | ||||
| CSCALKERNEL = ../arm/zscal.c | |||||
| ZSCALKERNEL = ../arm/zscal.c | |||||
| @@ -86,23 +86,23 @@ DGEMVTKERNEL = dgemv_t_4.c | |||||
| CGEMVTKERNEL = cgemv_t_4.c | CGEMVTKERNEL = cgemv_t_4.c | ||||
| ZGEMVTKERNEL = zgemv_t_4.c | ZGEMVTKERNEL = zgemv_t_4.c | ||||
| STRMMKERNEL = strmm8x4V.S | |||||
| STRMMKERNEL = gemm_vec.c | |||||
| DTRMMKERNEL = trmm8x4V.S | DTRMMKERNEL = trmm8x4V.S | ||||
| CTRMMKERNEL = ctrmm4x4V.S | CTRMMKERNEL = ctrmm4x4V.S | ||||
| ZTRMMKERNEL = ztrmm4x4V.S | ZTRMMKERNEL = ztrmm4x4V.S | ||||
| SGEMMKERNEL = strmm8x4V.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMKERNEL = gemm_vec.c | |||||
| ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
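# With the z14 defaults of SGEMM_UNROLL_M = 16 and SGEMM_UNROLL_N = 4 (see the
# param.h change in this series), the lines above should resolve to
# ../generic/gemm_ncopy_16.c, gemm_tcopy_16.c, gemm_ncopy_4.c and gemm_tcopy_4.c.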
| DGEMMKERNEL = gemm8x4V.S | DGEMMKERNEL = gemm8x4V.S | ||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | DGEMMINCOPY = ../generic/gemm_ncopy_8.c | ||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | DGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
| @@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| @@ -0,0 +1,472 @@ | |||||
| /* | |||||
| * Copyright (c) IBM Corporation 2020. | |||||
| * All rights reserved. | |||||
| * | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written | |||||
| * permission. | |||||
| * | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| */ | |||||
| #include "common.h" | |||||
| #include <vecintrin.h> | |||||
| #include <stdbool.h> | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #ifdef COMPLEX | |||||
| #error "Handling for complex numbers is not supported in this kernel" | |||||
| #endif | |||||
| #ifdef DOUBLE | |||||
| #define UNROLL_M DGEMM_DEFAULT_UNROLL_M | |||||
| #define UNROLL_N DGEMM_DEFAULT_UNROLL_N | |||||
| #else | |||||
| #define UNROLL_M SGEMM_DEFAULT_UNROLL_M | |||||
| #define UNROLL_N SGEMM_DEFAULT_UNROLL_N | |||||
| #endif | |||||
| static const size_t unroll_m = UNROLL_M; | |||||
| static const size_t unroll_n = UNROLL_N; | |||||
| /* Handling of triangular matrices */ | |||||
| #ifdef TRMMKERNEL | |||||
| static const bool trmm = true; | |||||
| static const bool left = | |||||
| #ifdef LEFT | |||||
| true; | |||||
| #else | |||||
| false; | |||||
| #endif | |||||
| static const bool backwards = | |||||
| #if defined(LEFT) != defined(TRANSA) | |||||
| true; | |||||
| #else | |||||
| false; | |||||
| #endif | |||||
| #else | |||||
| static const bool trmm = false; | |||||
| static const bool left = false; | |||||
| static const bool backwards = false; | |||||
| #endif /* TRMMKERNEL */ | |||||
| /* | |||||
| * Background: | |||||
| * | |||||
| * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication | |||||
| * problem by splitting all matrices into partitions multiple times, so that the | |||||
| * submatrices fit into the L1 or L2 caches. As a result, each multiplication of | |||||
* submatrices can stream data quickly from the L1 and L2 caches. In between, it copies
| * and rearranges the submatrices to enable contiguous memory accesses to | |||||
| * improve locality in both caches and TLBs. | |||||
| * | |||||
* At the heart of the algorithm is this kernel, which multiplies a "Block
| * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is | |||||
| * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this | |||||
| * operation GEBP. This kernel further partitions GEBP twice, such that (1) | |||||
| * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a | |||||
| * block of C fits into the registers, while multiplying panels from A and B | |||||
| * streamed from the L2 and L1 cache, respectively (GEBP_block). | |||||
| * | |||||
| * | |||||
| * Algorithm GEBP(A, B, C, m, n, k, alpha): | |||||
| * | |||||
| * The problem is calculating C += alpha * (A * B) | |||||
* C is an m x n matrix, A is an m x k matrix, B is a k x n matrix.
| * | |||||
| * - C is in column-major-order, with an offset of ldc to the element in the | |||||
| * next column (same row). | |||||
| * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column | |||||
| * contiguously while walking along rows. | |||||
| * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row | |||||
| * contiguously. | |||||
| * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or | |||||
| * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 | |||||
| * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a | |||||
| * block-of-1). | |||||
| * | |||||
| * Note that packing A and B into that form is taken care of by the caller in | |||||
| * driver/level3/level3.c (actually done by "copy kernels"). | |||||
| * | |||||
| * Steps: | |||||
| * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. | |||||
| * Now, B_j should fit into the L1 cache. | |||||
| * - For each partition, calculate C_j += alpha * (A * B_j) by | |||||
| * (1) Calculate C_aux := A * B_j (see below) | |||||
| * (2) unpack C_j = C_j + alpha * C_aux | |||||
| * | |||||
| * | |||||
| * Algorithm for Calculating C_aux: | |||||
| * | |||||
| * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, | |||||
| * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each | |||||
| * submatrix of C_aux can be calculated independently, and the registers are | |||||
| * added back into C_j. | |||||
| * | |||||
| * - For each row-block of C_aux: | |||||
| * (uses a row block of A and full B_j) | |||||
| * - stream over all columns of A, multiply with elements from B and | |||||
| * accumulate in registers. (use different inner-kernels to exploit | |||||
| * vectorization for varying block sizes) | |||||
| * - add alpha * row block of C_aux back into C_j. | |||||
| * | |||||
| * Note that there are additional mechanics for handling triangular matrices, | |||||
| * calculating B := alpha (A * B) where either of the matrices A or B can be | |||||
| * triangular. In case of A, the macro "LEFT" is defined. In addition, A can | |||||
| * optionally be transposed. | |||||
| * The code effectively skips an "offset" number of columns in A and rows of B | |||||
| * in each block, to save unnecessary work by exploiting the triangular nature. | |||||
| * To handle all cases, the code discerns (1) a "left" mode when A is triangular | |||||
| * and (2) "forward" / "backwards" modes where only the first "offset" | |||||
| * columns/rows of A/B are used or where the first "offset" columns/rows are | |||||
| * skipped, respectively. | |||||
| * | |||||
| * Reference: | |||||
| * | |||||
| * The summary above is based on staring at various kernel implementations and: | |||||
| * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix | |||||
* Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No.
| * 3, May 2008. | |||||
| */ | |||||
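/*
 * A small worked example of the packed layouts described above (assuming
 * UNROLL_M = 4 and UNROLL_N = 2 for brevity): a 4x3 block of A with elements
 * a(i,j) arrives as
 *     a(0,0) a(1,0) a(2,0) a(3,0)  a(0,1) a(1,1) a(2,1) a(3,1)  a(0,2) ...
 * (four row elements of one column at a time), and a 3x2 panel of B with
 * elements b(i,j) arrives as
 *     b(0,0) b(0,1)  b(1,0) b(1,1)  b(2,0) b(2,1)
 * (two column elements of one row at a time). The index expressions
 * A[i + k * ROWS] and B[j + k * COLS] used in GEBP_block below rely on
 * exactly this layout.
 */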
| #define VLEN_BYTES 16 | |||||
| #define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) | |||||
| typedef FLOAT vector_float __attribute__ ((vector_size (16))); | |||||
| /** | |||||
* Load a vector into a register, hinting at 8-byte alignment to improve
* performance. gcc-9 and newer will create these hints by themselves. For older
| * compiler versions, use inline assembly to explicitly express the hint. | |||||
| * Provide explicit hex encoding to cater for binutils versions that do not know | |||||
| * about vector-load with alignment hints yet. | |||||
| * | |||||
| * Note that, for block sizes where we apply vectorization, vectors in A will | |||||
| * always be 8-byte aligned. | |||||
| */ | |||||
| static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||||
| vector_float const *restrict addr = (vector_float const *restrict)a; | |||||
| vector_float y; | |||||
| #if __GNUC__ < 9 | |||||
| // hex-encode vl %[out],%[addr],3 | |||||
| asm(".insn vrx,0xe70000003006,%[out],%[addr],3" | |||||
| : [ out ] "=v"(y) | |||||
| : [ addr ] "R"(*addr)); | |||||
| #else | |||||
| y = *addr; | |||||
| #endif | |||||
| return y; | |||||
| } | |||||
| /** | |||||
* Calculate a row block of C_i of size ROWS x COLS using vector intrinsics.
*
* @param[in] A Pointer to the current block of input matrix A.
* @param[in] bk Number of columns in A.
* @param[in] B Pointer to the current block of input matrix B.
* @param[inout] C Pointer to the current block of output matrix C.
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| */ | |||||
| #define VECTOR_BLOCK(ROWS, COLS) \ | |||||
| static inline void GEBP_block_##ROWS##_##COLS( \ | |||||
| FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ | |||||
| FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ | |||||
| _Static_assert( \ | |||||
| ROWS % VLEN_FLOATS == 0, \ | |||||
| "rows in block must be multiples of vector length"); \ | |||||
| vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ | |||||
| \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) \ | |||||
| Caux[i][j] = vec_splats(ZERO); \ | |||||
| \ | |||||
| /* \ | |||||
| * Stream over the row-block of A, which is packed \ | |||||
| * column-by-column, multiply by coefficients in B and add up \ | |||||
| * into temporaries Caux (which the compiler will hold in \ | |||||
| * registers). Vectorization: Multiply column vectors from A \ | |||||
| * with scalars from B and add up in column vectors of Caux. \ | |||||
| * That equates to unrolling the loop over rows (in i) and \ | |||||
| * executing each unrolled iteration as a vector element. \ | |||||
| */ \ | |||||
| for (BLASLONG k = 0; k < bk; k++) { \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | |||||
| vector_float Ak = vec_load_hinted( \ | |||||
| A + i * VLEN_FLOATS + k * ROWS); \ | |||||
| \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) \ | |||||
| Caux[i][j] += Ak * B[j + k * COLS]; \ | |||||
| } \ | |||||
| } \ | |||||
| \ | |||||
| /* \ | |||||
| * Unpack row-block of C_aux into outer C_i, multiply by \ | |||||
| * alpha and add up. \ | |||||
| */ \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) { \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | |||||
| vector_float *C_ij = \ | |||||
| (vector_float *)(C + i * VLEN_FLOATS + \ | |||||
| j * ldc); \ | |||||
| if (trmm) { \ | |||||
| *C_ij = alpha * Caux[i][j]; \ | |||||
| } else { \ | |||||
| *C_ij += alpha * Caux[i][j]; \ | |||||
| } \ | |||||
| } \ | |||||
| } \ | |||||
| } | |||||
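/*
 * For example, in single precision (VLEN_FLOATS = 4), VECTOR_BLOCK(16, 4)
 * defines GEBP_block_16_4(), which keeps Caux[4][4] -- sixteen 4-float vector
 * accumulators covering a 16x4 tile of C -- and, per iteration of the loop
 * over bk, loads four vectors from A and multiplies each with the four
 * scalars B[0..3 + k * 4].
 */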
| #if UNROLL_M == 16 | |||||
| VECTOR_BLOCK(16, 4) | |||||
| VECTOR_BLOCK(16, 2) | |||||
| VECTOR_BLOCK(16, 1) | |||||
| #endif | |||||
| #if UNROLL_N == 8 | |||||
| VECTOR_BLOCK(8, 8) | |||||
| VECTOR_BLOCK(4, 8) | |||||
| #endif | |||||
| VECTOR_BLOCK(8, 4) | |||||
| VECTOR_BLOCK(8, 2) | |||||
| VECTOR_BLOCK(8, 1) | |||||
| VECTOR_BLOCK(4, 4) | |||||
| VECTOR_BLOCK(4, 2) | |||||
| VECTOR_BLOCK(4, 1) | |||||
| #ifdef DOUBLE | |||||
| VECTOR_BLOCK(2, 4) | |||||
| VECTOR_BLOCK(2, 2) | |||||
| #endif | |||||
| /** | |||||
| * Handle calculation for row blocks in C_i of any size by dispatching into | |||||
| * macro-defined (inline) functions or by deferring to a simple generic | |||||
| * implementation. Note that the compiler can remove this awkward-looking | |||||
* dispatching code while inlining.
| * | |||||
| * @param[in] m Number of rows in block C_i. | |||||
| * @param[in] n Number of columns in block C_i. | |||||
| * @param[in] first_row Index of first row of the block C_i (relative to C). | |||||
| * @param[in] A Pointer to input matrix A (note: all of it). | |||||
| * @param[in] k Number of columns in A and rows in B. | |||||
| * @param[in] B Pointer to current column block (panel) of input matrix B. | |||||
| * @param[inout] C Pointer to current column block (panel) of output matrix C. | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| * @param[in] off Running offset for handling triangular matrices. | |||||
| */ | |||||
| static inline void GEBP_block(BLASLONG m, BLASLONG n, | |||||
| BLASLONG first_row, | |||||
| const FLOAT * restrict A, BLASLONG k, | |||||
| const FLOAT * restrict B, | |||||
| FLOAT *restrict C, BLASLONG ldc, | |||||
| FLOAT alpha, | |||||
| BLASLONG offset, BLASLONG off) | |||||
| { | |||||
| if (trmm && left) | |||||
| off = offset + first_row; | |||||
| A += first_row * k; | |||||
| C += first_row; | |||||
| if (trmm) { | |||||
| if (backwards) { | |||||
| A += off * m; | |||||
| B += off * n; | |||||
| k -= off; | |||||
| } else { | |||||
| if (left) { | |||||
| k = off + m; | |||||
| } else { | |||||
| k = off + n; | |||||
| } | |||||
| } | |||||
| } | |||||
| #define BLOCK(bm, bn) \ | |||||
| if (m == bm && n == bn) { \ | |||||
| GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ | |||||
| return; \ | |||||
| } | |||||
| #if UNROLL_M == 16 | |||||
| BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); | |||||
| #endif | |||||
| #if UNROLL_N == 8 | |||||
| BLOCK(8, 8); BLOCK(4, 8); | |||||
| #endif | |||||
| BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); | |||||
| BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); | |||||
| #ifdef DOUBLE | |||||
| BLOCK(2, 4); | |||||
| BLOCK(2, 2); | |||||
| #endif | |||||
| #undef BLOCK | |||||
| /* simple implementation for smaller block sizes: */ | |||||
| FLOAT Caux[m][n] __attribute__ ((aligned (16))); | |||||
| /* | |||||
| * Peel off first iteration (i.e., column of A) for initializing Caux | |||||
| */ | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| Caux[i][j] = A[i] * B[j]; | |||||
| for (BLASLONG kk = 1; kk < k; kk++) | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| Caux[i][j] += A[i + kk * m] * B[j + kk * n]; | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| if (trmm) { | |||||
| C[i + j * ldc] = alpha * Caux[i][j]; | |||||
| } else { | |||||
| C[i + j * ldc] += alpha * Caux[i][j]; | |||||
| } | |||||
| } | |||||
| /** | |||||
| * Handle a column block (panel) of C and B while calculating C += alpha(A * B). | |||||
| * | |||||
| * @param[in] num_cols Number of columns in the block (in C and B). | |||||
| * @param[in] first_col First column of the current block (in C and B). | |||||
| * @param[in] A Pointer to input matrix A. | |||||
| * @param[in] bk Number of columns in A and rows in B. | |||||
| * @param[in] B Pointer to input matrix B (note: all of it). | |||||
| * @param[in] bm Number of rows in C and A. | |||||
| * @param[inout] C Pointer to output matrix C (note: all of it). | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| */ | |||||
| static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, | |||||
| const FLOAT *restrict A, BLASLONG bk, | |||||
| const FLOAT *restrict B, BLASLONG bm, | |||||
| FLOAT *restrict C, BLASLONG ldc, | |||||
| FLOAT alpha, | |||||
| BLASLONG const offset) { | |||||
| FLOAT *restrict C_i = C + first_col * ldc; | |||||
| /* | |||||
* B is packed in blocks of n_r row elements per column, but the exact
* layout does not matter here: we always advance by whole column
* blocks, i.e., by first_col * bk elements.
| */ | |||||
| const FLOAT *restrict B_i = B + first_col * bk; | |||||
| BLASLONG off = 0; | |||||
| if (trmm) { | |||||
| if (left) { | |||||
| off = offset; | |||||
| } else { | |||||
| off = -offset + first_col; | |||||
| } | |||||
| } | |||||
| /* | |||||
| * Calculate C_aux := A * B_j | |||||
| * then unpack C_i += alpha * C_aux. | |||||
| * | |||||
| * For that purpose, further partition C_aux and A into blocks | |||||
| * of m_r (unroll_m) rows, or powers-of-2 if smaller. | |||||
| */ | |||||
| BLASLONG row = 0; | |||||
| for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) | |||||
| for (; bm - row >= block_size; row += block_size) | |||||
| GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, | |||||
| ldc, alpha, offset, off); | |||||
| } | |||||
| /** | |||||
| * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) | |||||
| * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and | |||||
| * C are pointers to submatrices of the actual matrices. | |||||
| * | |||||
| * For triangular matrix multiplication, calculate B := alpha (A * B) where A | |||||
| * or B can be triangular (in case of A, the macro LEFT will be defined). | |||||
| * | |||||
| * @param[in] bm Number of rows in C and A. | |||||
| * @param[in] bn Number of columns in C and B. | |||||
| * @param[in] bk Number of columns in A and rows in B. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] ba Pointer to input matrix A. | |||||
| * @param[in] bb Pointer to input matrix B. | |||||
| * @param[inout] C Pointer to output matrix C. | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| * @returns 0 on success. | |||||
| */ | |||||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, | |||||
| FLOAT *restrict ba, FLOAT *restrict bb, | |||||
| FLOAT *restrict C, BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| , BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
| if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) | |||||
| return 0; | |||||
| /* | |||||
* The interface code allocates the buffers for ba and bb at page
* granularity (i.e., using mmap(MAP_ANONYMOUS)), so let the compiler
* make use of that fact in vector load operations.
| */ | |||||
| ba = __builtin_assume_aligned(ba, 16); | |||||
| bb = __builtin_assume_aligned(bb, 16); | |||||
| /* | |||||
| * Use offset and off even when compiled as SGEMMKERNEL to simplify | |||||
| * function signatures and function calls. | |||||
| */ | |||||
| #ifndef TRMMKERNEL | |||||
| BLASLONG const offset = 0; | |||||
| #endif | |||||
| /* | |||||
| * Partition B and C into blocks of n_r (unroll_n) columns, called B_i | |||||
| * and C_i. For each partition, calculate C_i += alpha * (A * B_j). | |||||
| * | |||||
| * For remaining columns that do not fill up a block of n_r, iteratively | |||||
| * use smaller block sizes of powers of 2. | |||||
| */ | |||||
| BLASLONG col = 0; | |||||
| for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) | |||||
| for (; bn - col >= block_size; col += block_size) | |||||
| GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); | |||||
| return 0; | |||||
| } | |||||
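/*
 * Toy example of the column partitioning above: with unroll_n = 4 and bn = 7,
 * GEBP_column_block() is called three times, for column blocks of width 4, 2
 * and 1 starting at columns 0, 4 and 6, so every remainder is handled by one
 * of the power-of-two block kernels dispatched in GEBP_block().
 */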
| @@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
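/* 16x4 matches the largest vectorized row block, VECTOR_BLOCK(16, 4), in kernel/zarch/gemm_vec.c */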
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||
| @@ -46,6 +46,27 @@ typedef union | |||||
| } bits; | } bits; | ||||
| } bfloat16_bits; | } bfloat16_bits; | ||||
| typedef union | |||||
| { | |||||
| float v; | |||||
| struct | |||||
| { | |||||
| uint32_t m:23; | |||||
| uint32_t e:8; | |||||
| uint32_t s:1; | |||||
| } bits; | |||||
| } float32_bits; | |||||
| float | |||||
| float16to32 (bfloat16_bits f16) | |||||
| { | |||||
| float32_bits f32; | |||||
| f32.bits.s = f16.bits.s; | |||||
| f32.bits.e = f16.bits.e; | |||||
| f32.bits.m = (uint32_t) f16.bits.m << 16; | |||||
| return f32.v; | |||||
| } | |||||
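/*
 * Worked example (hedged): the bfloat16 bit pattern 0x3FC0 is the upper half
 * of the float 1.5f (0x3FC00000). float16to32() copies s = 0 and e = 127 and
 * widens m = 0x40 to 0x40 << 16 = 0x400000, reassembling 0x3FC00000 == 1.5f.
 */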
| int | int | ||||
| main (int argc, char *argv[]) | main (int argc, char *argv[]) | ||||
| { | { | ||||
| @@ -55,8 +76,6 @@ main (int argc, char *argv[]) | |||||
| int loop = 100; | int loop = 100; | ||||
| char transA = 'N', transB = 'N'; | char transA = 'N', transB = 'N'; | ||||
| float alpha = 1.0, beta = 0.0; | float alpha = 1.0, beta = 0.0; | ||||
| char transa = 'N'; | |||||
| char transb = 'N'; | |||||
| for (int x = 0; x <= loop; x++) | for (int x = 0; x <= loop; x++) | ||||
| { | { | ||||
| @@ -65,30 +84,45 @@ main (int argc, char *argv[]) | |||||
| float B[k * n]; | float B[k * n]; | ||||
| float C[m * n]; | float C[m * n]; | ||||
| bfloat16_bits AA[m * k], BB[k * n]; | bfloat16_bits AA[m * k], BB[k * n]; | ||||
| float CC[m * n]; | |||||
| float DD[m * n], CC[m * n]; | |||||
| for (int j = 0; j < m; j++) | for (int j = 0; j < m; j++) | ||||
| { | { | ||||
| for (int i = 0; i < m; i++) | for (int i = 0; i < m; i++) | ||||
| { | { | ||||
| A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||||
| B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||||
| A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
| B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
| C[j * k + i] = 0; | C[j * k + i] = 0; | ||||
| AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; | AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; | ||||
| BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; | BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; | ||||
| CC[j * k + i] = 0; | CC[j * k + i] = 0; | ||||
| DD[j * k + i] = 0; | |||||
| } | } | ||||
| } | } | ||||
| SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | ||||
| &m, B, &k, &beta, C, &m); | |||||
| &m, B, &k, &beta, C, &m); | |||||
| SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | ||||
| &m, BB, &k, &beta, CC, &m); | |||||
| &m, BB, &k, &beta, CC, &m); | |||||
| for (i = 0; i < n; i++) | for (i = 0; i < n; i++) | ||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) | |||||
| ret++; | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) | |||||
| ret++; | |||||
| if (transA == 'N' && transB == 'N') | |||||
| { | |||||
| for (i = 0; i < n; i++) | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| { | |||||
| DD[i * m + j] += | |||||
| float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); | |||||
| } | |||||
| for (i = 0; i < n; i++) | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (CC[i * m + j] != DD[i * m + j]) | |||||
| ret++; | |||||
| } | |||||
| } | } | ||||
| if (ret != 0) | if (ret != 0) | ||||
| fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | ||||