| @@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc | |||
| else() | |||
| set(NO_AFFINITY 1) | |||
| endif() | |||
| option(BUILD_SINGLE "Single precision" OFF) | |||
| option(BUILD_DOUBLE "Double precision" OFF) | |||
| option(BUILD_COMPLEX "Single precision" OFF) | |||
| option(BUILD_COMPLEX16 "Single precision" OFF) | |||
| option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) | |||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoids conflicts with other BLAS libraries, especially when using | |||
| @@ -91,13 +89,13 @@ if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| if (NOT DEFINED BUILD_HALF) | |||
| set (BUILD_HALF false) | |||
| if (NOT DEFINED BUILD_BFLOAT16) | |||
| set (BUILD_BFLOAT16 false) | |||
| endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| # set(BUILD_HALF true) | |||
| # set(BUILD_BFLOAT16 true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -110,33 +108,28 @@ endif() | |||
| set(FLOAT_TYPES "") | |||
| if (BUILD_SINGLE) | |||
| message(STATUS "Building Songle Precision") | |||
| list(APPEND FLOAT_TYPES "SINGLE") | |||
| # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") | |||
| message(STATUS "Building Single Precision") | |||
| list(APPEND FLOAT_TYPES "SINGLE") # defines nothing | |||
| endif () | |||
| if (BUILD_DOUBLE) | |||
| message(STATUS "Building Double Precision") | |||
| list(APPEND FLOAT_TYPES "DOUBLE") | |||
| #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") | |||
| list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE | |||
| endif () | |||
| if (BUILD_COMPLEX) | |||
| message(STATUS "Building Complex Precision") | |||
| list(APPEND FLOAT_TYPES "COMPLEX") | |||
| #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") | |||
| endif () | |||
| list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX | |||
| endif () | |||
| if (BUILD_COMPLEX16) | |||
| message(STATUS "Building Double Complex Precision") | |||
| list(APPEND FLOAT_TYPES "ZCOMPLEX") | |||
| #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") | |||
| list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE | |||
| endif () | |||
| if (BUILD_HALF) | |||
| if (BUILD_BFLOAT16) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "HALF") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") | |||
| list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||
| endif () | |||
| if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | |||
| @@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| @@ -272,17 +272,33 @@ COMMON_PROF = -pg | |||
| # work at all. | |||
| # | |||
| # CPP_THREAD_SAFETY_TEST = 1 | |||
| # | |||
| # use this to run only the less memory-hungry GEMV test | |||
| # CPP_THREAD_SAFETY_GEMV = 1 | |||
| # If you want to enable the experimental BFLOAT16 support | |||
| # BUILD_HALF = 1 | |||
| # | |||
| # Select if you need to build only select types | |||
| # BUILD_SINGLE = 1 | |||
| # BUILD_DOUBLE = 1 | |||
| # BUILD_COMPLEX = 1 | |||
| # BUILD_COMPLEX16 = 1 | |||
| # | |||
| # | |||
| # BUILD_BFLOAT16 = 1 | |||
| # Set the thread number threshold beyond which the job array for the threaded level3 BLAS | |||
| # will be allocated on the heap rather than the stack. (This array alone requires | |||
| # NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu | |||
| # counts, but obviously it is not the only item that ends up on the stack.) | |||
| # The default value of 32 ensures that the overall requirement is compatible | |||
| # with the default 1MB stacksize imposed by having the Java VM loaded without use | |||
| # of its -Xss parameter. | |||
| # The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible | |||
| # with the common Linux stacksize of 8MB but will cause crashes with unwary use of the Java | |||
| # VM, e.g. in Octave or with the Java-based libhdfs in numpy or scipy code. | |||
| # BLAS3_MEM_ALLOC_THRESHOLD = 160 | |||
| # The settings below are not yet configurable here; use cmake if you need to build only selected types. | |||
| BUILD_SINGLE = 1 | |||
| BUILD_DOUBLE = 1 | |||
| BUILD_COMPLEX = 1 | |||
| BUILD_COMPLEX16 = 1 | |||
| # End of user configuration | |||
| # | |||
| @@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1) | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| CCOMMON_OPT += -DBUILD_HALF | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| CCOMMON_OPT += -DBUILD_BFLOAT16 | |||
| endif | |||
| ifeq ($(BUILD_SINGLE), 1) | |||
| CCOMMON_OPT += -DBUILD_SINGLE=1 | |||
| @@ -1521,10 +1521,10 @@ export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export BUILD_HALF | |||
| export BUILD_BFLOAT16 | |||
| export SHGEMM_UNROLL_M | |||
| export SHGEMM_UNROLL_N | |||
| export SBGEMM_UNROLL_M | |||
| export SBGEMM_UNROLL_N | |||
| export SGEMM_UNROLL_M | |||
| export SGEMM_UNROLL_N | |||
| export DGEMM_UNROLL_M | |||
| @@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) | |||
| endif | |||
| $(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX | |||
| $(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX | |||
| $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX | |||
| $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX | |||
| $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX | |||
| $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX | |||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX | |||
| $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX | |||
| $(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX | |||
| $(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX | |||
| $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| @@ -49,8 +49,8 @@ else | |||
| GOTO_LAPACK_TARGETS= | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| GOTO_HALF_TARGETS=shgemm.goto | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| GOTO_HALF_TARGETS=sbgemm.goto | |||
| else | |||
| GOTO_HALF_TARGETS= | |||
| endif | |||
| @@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sgemm #################################################### | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| endif | |||
| @@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c | |||
| zcholesky.$(SUFFIX) : cholesky.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) : gemm.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) : gemm.c | |||
| $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| endif | |||
| @@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DOUBLE | |||
| #define GEMM BLASFUNC(dgemm) | |||
| #elif defined(HALF) | |||
| #define GEMM BLASFUNC(shgemm) | |||
| #define GEMM BLASFUNC(sbgemm) | |||
| #else | |||
| #define GEMM BLASFUNC(sgemm) | |||
| #endif | |||
| @@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE | |||
| /* convert BFLOAT16 array to double array */ | |||
| void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); | |||
| /* dot product of BFLOAT16 input arrays, with the result returned as float */ | |||
| float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); | |||
| float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -113,7 +113,7 @@ macro(SetDefaultL1) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| if (BUILD_HALF) | |||
| if (BUILD_BFLOAT16) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| @@ -126,7 +126,7 @@ if (BUILD_HALF) | |||
| set(SHAXPYKERNEL ../arm/axpy.c) | |||
| set(SHAXPBYKERNEL ../arm/axpby.c) | |||
| set(SHCOPYKERNEL ../arm/copy.c) | |||
| set(SHDOTKERNEL ../x86_64/shdot.c) | |||
| set(SBDOTKERNEL ../x86_64/sbdot.c) | |||
| set(SHROTKERNEL ../arm/rot.c) | |||
| set(SHSCALKERNEL ../arm/scal.c) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| @@ -183,9 +183,9 @@ macro(SetDefaultL2) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SHGEMVTKERNEL ../arm/gemv_t.c) | |||
| if (BUILD_BFLOAT16) | |||
| set(SBGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SBGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| @@ -195,18 +195,18 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| if (BUILD_HALF) | |||
| if (BUILD_BFLOAT16) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SHGEMM_BETA ../generic/gemm_beta.c) | |||
| set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SHGEMMINCOPYOBJ shgemm_incopy.o) | |||
| set(SHGEMMITCOPYOBJ shgemm_itcopy.o) | |||
| set(SHGEMMONCOPYOBJ shgemm_oncopy.o) | |||
| set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) | |||
| set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SBGEMM_BETA ../generic/gemm_beta.c) | |||
| set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
| set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
| set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
| set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -16,8 +16,8 @@ | |||
| # HAVE_SSE2 | |||
| # HAVE_SSE3 | |||
| # MAKE | |||
| # SHGEMM_UNROLL_M | |||
| # SHGEMM_UNROLL_N | |||
| # SBGEMM_UNROLL_M | |||
| # SBGEMM_UNROLL_N | |||
| # SGEMM_UNROLL_M | |||
| # SGEMM_UNROLL_N | |||
| # DGEMM_UNROLL_M | |||
| @@ -471,8 +471,8 @@ endif () | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| endif() | |||
| set(SHGEMM_UNROLL_M 8) | |||
| set(SHGEMM_UNROLL_N 4) | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| # Or should this actually be NUM_CORES? | |||
| if (${NUM_THREADS} GREATER 0) | |||
| @@ -70,6 +70,9 @@ if (DEFINED TARGET) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_SSE3) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") | |||
| endif() | |||
| endif() | |||
| if (DEFINED TARGET) | |||
| @@ -323,7 +326,13 @@ else () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") | |||
| endif () | |||
| endif () | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) | |||
| if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (DEFINED LIBNAMESUFFIX) | |||
| set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") | |||
| else () | |||
| @@ -401,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO | |||
| set (BUILD_COMPLEX16 ON) | |||
| endif() | |||
| if (BUILD_SINGLE) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") | |||
| endif() | |||
| if (BUILD_DOUBLE) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") | |||
| endif() | |||
| if (BUILD_COMPLEX) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") | |||
| endif() | |||
| if (BUILD_COMPLEX16) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1") | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") | |||
| endif() | |||
| if(NOT MSVC) | |||
| set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
| @@ -588,8 +593,8 @@ endif () | |||
| #export FUNCTION_PROFILE | |||
| #export TARGET_CORE | |||
| # | |||
| #export SHGEMM_UNROLL_M | |||
| #export SHGEMM_UNROLL_N | |||
| #export SBGEMM_UNROLL_M | |||
| #export SBGEMM_UNROLL_N | |||
| #export SGEMM_UNROLL_M | |||
| #export SGEMM_UNROLL_N | |||
| #export DGEMM_UNROLL_M | |||
| @@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in) | |||
| if (complex_only) | |||
| list(REMOVE_ITEM float_list "SINGLE") | |||
| list(REMOVE_ITEM float_list "DOUBLE") | |||
| list(REMOVE_ITEM float_list "HALF") | |||
| list(REMOVE_ITEM float_list "BFLOAT16") | |||
| elseif (real_only) | |||
| list(REMOVE_ITEM float_list "COMPLEX") | |||
| list(REMOVE_ITEM float_list "ZCOMPLEX") | |||
| @@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in) | |||
| if (NOT no_float_type) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| string(TOLOWER ${float_char} float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "sh") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "sb") | |||
| endif () | |||
| endif () | |||
| @@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in) | |||
| if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| list(APPEND obj_defines "DOUBLE") | |||
| endif () | |||
| if (${float_type} STREQUAL "HALF") | |||
| list(APPEND obj_defines "HALF") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| list(APPEND obj_defines "BFLOAT16") | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| list(APPEND obj_defines "COMPLEX") | |||
| @@ -260,7 +260,7 @@ typedef unsigned long BLASULONG; | |||
| #ifndef BFLOAT16 | |||
| #include <stdint.h> | |||
| typedef uint16_t bfloat16; | |||
| #define HALFCONVERSION 1 | |||
| #define BFLOAT16CONVERSION 1 | |||
| #endif | |||
| #ifdef USE64BITINT | |||
| @@ -303,7 +303,7 @@ typedef int blasint; | |||
| #define SIZE 8 | |||
| #define BASE_SHIFT 3 | |||
| #define ZBASE_SHIFT 4 | |||
| #elif defined(HALF) | |||
| #elif defined(BFLOAT16) | |||
| #define IFLOAT bfloat16 | |||
| #define XFLOAT IFLOAT | |||
| #define FLOAT float | |||
| @@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); | |||
| double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); | |||
| float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); | |||
| float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); | |||
| void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); | |||
| void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); | |||
| void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); | |||
| @@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint | |||
| /* Level 3 routines */ | |||
| void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, | |||
| void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, | |||
| bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| @@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); | |||
| float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | |||
| @@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||
| int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||
| int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| #endif | |||
| int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| @@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl | |||
| int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); | |||
| int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); | |||
| int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); | |||
| @@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float | |||
| int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); | |||
| int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); | |||
| int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| @@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON | |||
| int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | |||
| #endif | |||
| int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| @@ -39,7 +39,7 @@ | |||
| #ifndef COMMON_MACRO | |||
| #define COMMON_MACRO | |||
| #include "common_sh.h" | |||
| #include "common_sb.h" | |||
| #include "common_s.h" | |||
| #include "common_d.h" | |||
| #include "common_q.h" | |||
| @@ -644,7 +644,7 @@ | |||
| #define GEADD_K DGEADD_K | |||
| #elif defined(HALF) | |||
| #elif defined(BFLOAT16) | |||
| #define D_TO_BF16_K SHDTOBF16_K | |||
| #define D_BF16_TO_K DBF16TOD_K | |||
| @@ -662,7 +662,7 @@ | |||
| #define ASUM_K SASUM_K | |||
| #define DOTU_K SDOTU_K | |||
| #define DOTC_K SDOTC_K | |||
| #define BF16_DOT_K SHDOT_K | |||
| #define BF16_DOT_K SBDOT_K | |||
| #define AXPYU_K SAXPYU_K | |||
| #define AXPYC_K SAXPYC_K | |||
| #define AXPBY_K SAXPBY_K | |||
| @@ -682,32 +682,32 @@ | |||
| #define NRM2_K SNRM2_K | |||
| #define SYMV_THREAD_U SSYMV_THREAD_U | |||
| #define SYMV_THREAD_L SSYMV_THREAD_L | |||
| #define GEMM_BETA SHGEMM_BETA | |||
| #define GEMM_KERNEL_N SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_L SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_R SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_B SHGEMM_KERNEL | |||
| #define GEMM_NN SHGEMM_NN | |||
| #define GEMM_CN SHGEMM_TN | |||
| #define GEMM_TN SHGEMM_TN | |||
| #define GEMM_NC SHGEMM_NT | |||
| #define GEMM_NT SHGEMM_NT | |||
| #define GEMM_CC SHGEMM_TT | |||
| #define GEMM_CT SHGEMM_TT | |||
| #define GEMM_TC SHGEMM_TT | |||
| #define GEMM_TT SHGEMM_TT | |||
| #define GEMM_NR SHGEMM_NN | |||
| #define GEMM_TR SHGEMM_TN | |||
| #define GEMM_CR SHGEMM_TN | |||
| #define GEMM_RN SHGEMM_NN | |||
| #define GEMM_RT SHGEMM_NT | |||
| #define GEMM_RC SHGEMM_NT | |||
| #define GEMM_RR SHGEMM_NN | |||
| #define GEMM_ONCOPY SHGEMM_ONCOPY | |||
| #define GEMM_OTCOPY SHGEMM_OTCOPY | |||
| #define GEMM_INCOPY SHGEMM_INCOPY | |||
| #define GEMM_ITCOPY SHGEMM_ITCOPY | |||
| #define GEMM_BETA SBGEMM_BETA | |||
| #define GEMM_KERNEL_N SBGEMM_KERNEL | |||
| #define GEMM_KERNEL_L SBGEMM_KERNEL | |||
| #define GEMM_KERNEL_R SBGEMM_KERNEL | |||
| #define GEMM_KERNEL_B SBGEMM_KERNEL | |||
| #define GEMM_NN SBGEMM_NN | |||
| #define GEMM_CN SBGEMM_TN | |||
| #define GEMM_TN SBGEMM_TN | |||
| #define GEMM_NC SBGEMM_NT | |||
| #define GEMM_NT SBGEMM_NT | |||
| #define GEMM_CC SBGEMM_TT | |||
| #define GEMM_CT SBGEMM_TT | |||
| #define GEMM_TC SBGEMM_TT | |||
| #define GEMM_TT SBGEMM_TT | |||
| #define GEMM_NR SBGEMM_NN | |||
| #define GEMM_TR SBGEMM_TN | |||
| #define GEMM_CR SBGEMM_TN | |||
| #define GEMM_RN SBGEMM_NN | |||
| #define GEMM_RT SBGEMM_NT | |||
| #define GEMM_RC SBGEMM_NT | |||
| #define GEMM_RR SBGEMM_NN | |||
| #define GEMM_ONCOPY SBGEMM_ONCOPY | |||
| #define GEMM_OTCOPY SBGEMM_OTCOPY | |||
| #define GEMM_INCOPY SBGEMM_INCOPY | |||
| #define GEMM_ITCOPY SBGEMM_ITCOPY | |||
| #define SYMM_THREAD_LU SSYMM_THREAD_LU | |||
| #define SYMM_THREAD_LL SSYMM_THREAD_LL | |||
| #define SYMM_THREAD_RU SSYMM_THREAD_RU | |||
| @@ -723,22 +723,22 @@ | |||
| #define HEMM_THREAD_RU SHEMM_THREAD_RU | |||
| #define HEMM_THREAD_RL SHEMM_THREAD_RL | |||
| #define GEMM_THREAD_NN SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_CN SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_TN SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_NC SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_NT SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_CC SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_CT SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TC SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TT SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_NR SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_TR SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_CR SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_RN SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_RT SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RC SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RR SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_NN SBGEMM_THREAD_NN | |||
| #define GEMM_THREAD_CN SBGEMM_THREAD_TN | |||
| #define GEMM_THREAD_TN SBGEMM_THREAD_TN | |||
| #define GEMM_THREAD_NC SBGEMM_THREAD_NT | |||
| #define GEMM_THREAD_NT SBGEMM_THREAD_NT | |||
| #define GEMM_THREAD_CC SBGEMM_THREAD_TT | |||
| #define GEMM_THREAD_CT SBGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TC SBGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TT SBGEMM_THREAD_TT | |||
| #define GEMM_THREAD_NR SBGEMM_THREAD_NN | |||
| #define GEMM_THREAD_TR SBGEMM_THREAD_TN | |||
| #define GEMM_THREAD_CR SBGEMM_THREAD_TN | |||
| #define GEMM_THREAD_RN SBGEMM_THREAD_NN | |||
| #define GEMM_THREAD_RT SBGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RC SBGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RR SBGEMM_THREAD_NN | |||
| #ifdef UNIT | |||
| @@ -2491,9 +2491,9 @@ | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG shgemm_p; | |||
| extern BLASLONG shgemm_q; | |||
| extern BLASLONG shgemm_r; | |||
| extern BLASLONG sbgemm_p; | |||
| extern BLASLONG sbgemm_q; | |||
| extern BLASLONG sbgemm_r; | |||
| extern BLASLONG sgemm_p; | |||
| extern BLASLONG sgemm_q; | |||
| extern BLASLONG sgemm_r; | |||
| @@ -47,114 +47,114 @@ typedef struct { | |||
| int dtb_entries; | |||
| int offsetA, offsetB, align; | |||
| #ifdef BUILD_HALF | |||
| int shgemm_p, shgemm_q, shgemm_r; | |||
| int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; | |||
| #ifdef BUILD_BFLOAT16 | |||
| int sbgemm_p, sbgemm_q, sbgemm_r; | |||
| int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | |||
| void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); | |||
| float (*shamax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shamin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shmax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shmin_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shasum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shsum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); | |||
| double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) | |||
| float (*sbamax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbamin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbmax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbmin_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbasum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sbsum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sbcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); | |||
| double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*sbtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*sbtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #endif | |||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | |||
| int sgemm_p, sgemm_q, sgemm_r; | |||
| int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; | |||
| #endif | |||
| int exclusive_cache; | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) | |||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | |||
| float (*samax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*samin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*smax_k) (BLASLONG, float *, BLASLONG); | |||
| @@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*snrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sasum_k) (BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) | |||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | |||
| int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) | |||
| int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) | |||
| int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) | |||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | |||
| #ifdef ARCH_X86_64 | |||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||
| @@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) | |||
| #ifdef BUILD_SINGLE | |||
| int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| @@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| @@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) | |||
| int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| int dgemm_p, dgemm_q, dgemm_r; | |||
| int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| double (*damax_k) (BLASLONG, double *, BLASLONG); | |||
| double (*damin_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dmax_k) (BLASLONG, double *, BLASLONG); | |||
| @@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); | |||
| BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dasum_k) (BLASLONG, double *, BLASLONG); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_SINGLE) || (BUILD_DOUBLE) | |||
| #if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) | |||
| double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
| int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
| #endif | |||
| #if (BUILD_DOUBLE) || (BUILD_COMPLEX16) | |||
| #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) | |||
| int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); | |||
| int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | |||
| int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| @@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_COMPLEX) || (BUILD_COMPLEX16) | |||
| #ifdef BUILD_COMPLEX | |||
| int cgemm_p, cgemm_q, cgemm_r; | |||
| int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; | |||
| float (*camax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*camin_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*csum_k) (BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) | |||
| int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| #endif | |||
| #if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) | |||
| int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_COMPLEX) | |||
| int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| @@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_COMPLEX) || (BUILD_COMPLEX16) | |||
| int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| @@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| #endif | |||
| #if (BUILD_COMPLEX) | |||
| int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| @@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); | |||
| int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); | |||
| int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); | |||
| #endif | |||
| #if (BUILD_COMPLEX) || (BUILD_COMPLEX16) | |||
| int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| int zgemm_p, zgemm_q, zgemm_r; | |||
| int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; | |||
| @@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| void (*init)(void); | |||
| int snum_opt, dnum_opt, qnum_opt; | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| #ifdef BUILD_COMPLEX | |||
| int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); | |||
| int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); | |||
| int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); | |||
| int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| #ifdef BUILD_COMPLEX | |||
| int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); | |||
| int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); | |||
| int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); | |||
| @@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| @@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); | |||
| int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); | |||
| int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); | |||
| int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); | |||
| int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); | |||
| int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); | |||
| int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| #ifdef BUILD_COMPLEX | |||
| int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); | |||
| int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); | |||
| int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); | |||
| @@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); | |||
| int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); | |||
| int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); | |||
| @@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); | |||
| #endif | |||
| #if BUILD_SINGLE | |||
| #ifdef BUILD_SINGLE | |||
| int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
| #endif | |||
| #if BUILD_DOUBLE | |||
| #ifdef BUILD_DOUBLE | |||
| int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX | |||
| #ifdef BUILD_COMPLEX | |||
| int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); | |||
| #endif | |||
| } gotoblas_t; | |||
| @@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 gotoblas -> exclusive_cache | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P gotoblas -> shgemm_p | |||
| #define SHGEMM_Q gotoblas -> shgemm_q | |||
| #define SHGEMM_R gotoblas -> shgemm_r | |||
| #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m | |||
| #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n | |||
| #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn | |||
| #ifdef BUILD_BFLOAT16 | |||
| #define SBGEMM_P gotoblas -> sbgemm_p | |||
| #define SBGEMM_Q gotoblas -> sbgemm_q | |||
| #define SBGEMM_R gotoblas -> sbgemm_r | |||
| #define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m | |||
| #define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n | |||
| #define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn | |||
| #endif | |||
| #if (BUILD_SINGLE) | |||
| #if defined (BUILD_SINGLE) | |||
| #define SGEMM_P gotoblas -> sgemm_p | |||
| #define SGEMM_Q gotoblas -> sgemm_q | |||
| #define SGEMM_R gotoblas -> sgemm_r | |||
| @@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas; | |||
| #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn | |||
| #endif | |||
| #if (BUILD_DOUBLE) | |||
| #if defined (BUILD_DOUBLE) | |||
| #define DGEMM_P gotoblas -> dgemm_p | |||
| #define DGEMM_Q gotoblas -> dgemm_q | |||
| #define DGEMM_R gotoblas -> dgemm_r | |||
| #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m | |||
| #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n | |||
| #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn | |||
| #if ! (BUILD_SINGLE) | |||
| #define SGEMM_P gotoblas -> sgemm_p | |||
| #define SGEMM_Q gotoblas -> sgemm_q | |||
| #define SGEMM_R gotoblas -> sgemm_r | |||
| #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m | |||
| #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n | |||
| #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn | |||
| #endif | |||
| #endif | |||
| #define QGEMM_P gotoblas -> qgemm_p | |||
| @@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas; | |||
| #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n | |||
| #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn | |||
| #if BUILD_COMPLEX | |||
| #ifdef BUILD_COMPLEX | |||
| #define CGEMM_P gotoblas -> cgemm_p | |||
| #define CGEMM_Q gotoblas -> cgemm_q | |||
| #define CGEMM_R gotoblas -> cgemm_r | |||
| @@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas; | |||
| #endif | |||
| #endif | |||
| #if BUILD_COMPLEX16 | |||
| #ifdef BUILD_COMPLEX16 | |||
| #define ZGEMM_P gotoblas -> zgemm_p | |||
| #define ZGEMM_Q gotoblas -> zgemm_q | |||
| #define ZGEMM_R gotoblas -> zgemm_r | |||
| @@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas; | |||
| #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n | |||
| #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn | |||
| #endif | |||
| #ifndef BUILD_COMPLEX | |||
| #define CGEMM_P gotoblas -> cgemm_p | |||
| #define CGEMM_Q gotoblas -> cgemm_q | |||
| #define CGEMM_R gotoblas -> cgemm_r | |||
| #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m | |||
| #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n | |||
| #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn | |||
| #endif | |||
| #endif | |||
| #define XGEMM_P gotoblas -> xgemm_p | |||
| @@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 0 | |||
| #endif | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P SHGEMM_DEFAULT_P | |||
| #define SHGEMM_Q SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_R SHGEMM_DEFAULT_R | |||
| #define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M | |||
| #define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N | |||
| #ifdef SHGEMM_DEFAULT_UNROLL_MN | |||
| #define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN | |||
| #ifdef BUILD_BFLOAT16 | |||
| #define SBGEMM_P SBGEMM_DEFAULT_P | |||
| #define SBGEMM_Q SBGEMM_DEFAULT_Q | |||
| #define SBGEMM_R SBGEMM_DEFAULT_R | |||
| #define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M | |||
| #define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N | |||
| #ifdef SBGEMM_DEFAULT_UNROLL_MN | |||
| #define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN | |||
| #else | |||
| #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) | |||
| #define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N)) | |||
| #endif | |||
| #endif | |||
| @@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas; | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #if (XDOUBLE) | |||
| #if defined(XDOUBLE) | |||
| #define GEMM_P QGEMM_P | |||
| #define GEMM_Q QGEMM_Q | |||
| #define GEMM_R QGEMM_R | |||
| @@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas; | |||
| #define GEMM_DEFAULT_R DGEMM_DEFAULT_R | |||
| #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M | |||
| #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N | |||
| #elif (HALF) | |||
| #define GEMM_P SHGEMM_P | |||
| #define GEMM_Q SHGEMM_Q | |||
| #define GEMM_R SHGEMM_R | |||
| #define GEMM_UNROLL_M SHGEMM_UNROLL_M | |||
| #define GEMM_UNROLL_N SHGEMM_UNROLL_N | |||
| #define GEMM_UNROLL_MN SHGEMM_UNROLL_MN | |||
| #define GEMM_DEFAULT_P SHGEMM_DEFAULT_P | |||
| #define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q | |||
| #define GEMM_DEFAULT_R SHGEMM_DEFAULT_R | |||
| #define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M | |||
| #define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N | |||
| #elif defined(BFLOAT16) | |||
| #define GEMM_P SBGEMM_P | |||
| #define GEMM_Q SBGEMM_Q | |||
| #define GEMM_R SBGEMM_R | |||
| #define GEMM_UNROLL_M SBGEMM_UNROLL_M | |||
| #define GEMM_UNROLL_N SBGEMM_UNROLL_N | |||
| #define GEMM_UNROLL_MN SBGEMM_UNROLL_MN | |||
| #define GEMM_DEFAULT_P SBGEMM_DEFAULT_P | |||
| #define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q | |||
| #define GEMM_DEFAULT_R SBGEMM_DEFAULT_R | |||
| #define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M | |||
| #define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N | |||
| #else | |||
| #define GEMM_P SGEMM_P | |||
| #define GEMM_Q SGEMM_Q | |||
| @@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas; | |||
| #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N | |||
| #endif | |||
| #else | |||
| #if (XDOUBLE) | |||
| #if defined(XDOUBLE) | |||
| #define GEMM_P XGEMM_P | |||
| #define GEMM_Q XGEMM_Q | |||
| #define GEMM_R XGEMM_R | |||
| @@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas; | |||
| #define GEMM_THREAD gemm_thread_n | |||
| #endif | |||
| #ifndef SHGEMM_DEFAULT_R | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #ifndef SBGEMM_DEFAULT_R | |||
| #define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SGEMM_DEFAULT_R | |||
| @@ -1518,7 +1474,7 @@ extern gotoblas_t *gotoblas; | |||
| #ifndef GEMM3M_P | |||
| #ifdef XDOUBLE | |||
| #define GEMM3M_P XGEMM3M_P | |||
| #elif defined (DOUBLE) | |||
| #elif defined(DOUBLE) | |||
| #define GEMM3M_P ZGEMM3M_P | |||
| #else | |||
| #define GEMM3M_P CGEMM3M_P | |||
| @@ -1528,7 +1484,7 @@ extern gotoblas_t *gotoblas; | |||
| #ifndef GEMM3M_Q | |||
| #ifdef XDOUBLE | |||
| #define GEMM3M_Q XGEMM3M_Q | |||
| #elif defined (DOUBLE) | |||
| #elif defined(DOUBLE) | |||
| #define GEMM3M_Q ZGEMM3M_Q | |||
| #else | |||
| #define GEMM3M_Q CGEMM3M_Q | |||
| @@ -1538,7 +1494,7 @@ extern gotoblas_t *gotoblas; | |||
| #ifndef GEMM3M_R | |||
| #ifdef XDOUBLE | |||
| #define GEMM3M_R XGEMM3M_R | |||
| #elif defined (DOUBLE) | |||
| #elif defined(DOUBLE) | |||
| #define GEMM3M_R ZGEMM3M_R | |||
| #else | |||
| #define GEMM3M_R CGEMM3M_R | |||
| @@ -0,0 +1,77 @@ | |||
| #ifndef COMMON_SB_H /* include guard; this header maps the upper-case SB* macros onto the sb* kernel and SBGEMM driver names */ | |||
| #define COMMON_SB_H | |||
| #ifndef DYNAMIC_ARCH /* without DYNAMIC_ARCH the macros name the kernel functions directly */ | |||
| #define SBDOT_K sbdot_k | |||
| #define SBSTOBF16_K sbstobf16_k | |||
| #define SBDTOBF16_K sbdtobf16_k | |||
| #define SBF16TOS_K sbf16tos_k | |||
| #define DBF16TOD_K dbf16tod_k | |||
| #define SBGEMM_ONCOPY sbgemm_oncopy | |||
| #define SBGEMM_OTCOPY sbgemm_otcopy | |||
| #if SBGEMM_DEFAULT_UNROLL_M == SBGEMM_DEFAULT_UNROLL_N /* equal default M/N unroll: the INCOPY/ITCOPY macros reuse the oncopy/otcopy routines */ | |||
| #define SBGEMM_INCOPY sbgemm_oncopy | |||
| #define SBGEMM_ITCOPY sbgemm_otcopy | |||
| #else /* differing unroll factors: dedicated incopy/itcopy routines are named */ | |||
| #define SBGEMM_INCOPY sbgemm_incopy | |||
| #define SBGEMM_ITCOPY sbgemm_itcopy | |||
| #endif | |||
| #define SBGEMM_BETA sbgemm_beta | |||
| #define SBGEMM_KERNEL sbgemm_kernel | |||
| #else /* DYNAMIC_ARCH: the same macros resolve to members reached through the gotoblas pointer */ | |||
| #define SBDOT_K gotoblas -> sbdot_k | |||
| #define SBSTOBF16_K gotoblas -> sbstobf16_k | |||
| #define SBDTOBF16_K gotoblas -> sbdtobf16_k | |||
| #define SBF16TOS_K gotoblas -> sbf16tos_k | |||
| #define DBF16TOD_K gotoblas -> dbf16tod_k | |||
| #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy | |||
| #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy | |||
| #define SBGEMM_INCOPY gotoblas -> sbgemm_incopy | |||
| #define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy | |||
| #define SBGEMM_BETA gotoblas -> sbgemm_beta | |||
| #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel | |||
| #endif /* DYNAMIC_ARCH */ | |||
| #define SBGEMM_NN sbgemm_nn /* driver names: in the list below every C variant aliases the T routine and every R variant aliases the N routine */ | |||
| #define SBGEMM_CN sbgemm_tn | |||
| #define SBGEMM_TN sbgemm_tn | |||
| #define SBGEMM_NC sbgemm_nt | |||
| #define SBGEMM_NT sbgemm_nt | |||
| #define SBGEMM_CC sbgemm_tt | |||
| #define SBGEMM_CT sbgemm_tt | |||
| #define SBGEMM_TC sbgemm_tt | |||
| #define SBGEMM_TT sbgemm_tt | |||
| #define SBGEMM_NR sbgemm_nn | |||
| #define SBGEMM_TR sbgemm_tn | |||
| #define SBGEMM_CR sbgemm_tn | |||
| #define SBGEMM_RN sbgemm_nn | |||
| #define SBGEMM_RT sbgemm_nt | |||
| #define SBGEMM_RC sbgemm_nt | |||
| #define SBGEMM_RR sbgemm_nn | |||
| #define SBGEMM_THREAD_NN sbgemm_thread_nn /* the sbgemm_thread_* names follow the same C-to-T and R-to-N aliasing */ | |||
| #define SBGEMM_THREAD_CN sbgemm_thread_tn | |||
| #define SBGEMM_THREAD_TN sbgemm_thread_tn | |||
| #define SBGEMM_THREAD_NC sbgemm_thread_nt | |||
| #define SBGEMM_THREAD_NT sbgemm_thread_nt | |||
| #define SBGEMM_THREAD_CC sbgemm_thread_tt | |||
| #define SBGEMM_THREAD_CT sbgemm_thread_tt | |||
| #define SBGEMM_THREAD_TC sbgemm_thread_tt | |||
| #define SBGEMM_THREAD_TT sbgemm_thread_tt | |||
| #define SBGEMM_THREAD_NR sbgemm_thread_nn | |||
| #define SBGEMM_THREAD_TR sbgemm_thread_tn | |||
| #define SBGEMM_THREAD_CR sbgemm_thread_tn | |||
| #define SBGEMM_THREAD_RN sbgemm_thread_nn | |||
| #define SBGEMM_THREAD_RT sbgemm_thread_nt | |||
| #define SBGEMM_THREAD_RC sbgemm_thread_nt | |||
| #define SBGEMM_THREAD_RR sbgemm_thread_nn | |||
| #endif /* COMMON_SB_H */ | |||
| @@ -1,77 +0,0 @@ | |||
| #ifndef COMMON_SH_H | |||
| #define COMMON_SH_H | |||
| #ifndef DYNAMIC_ARCH | |||
| #define SHDOT_K shdot_k | |||
| #define SHSTOBF16_K shstobf16_k | |||
| #define SHDTOBF16_K shdtobf16_k | |||
| #define SBF16TOS_K sbf16tos_k | |||
| #define DBF16TOD_K dbf16tod_k | |||
| #define SHGEMM_ONCOPY shgemm_oncopy | |||
| #define SHGEMM_OTCOPY shgemm_otcopy | |||
| #if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N | |||
| #define SHGEMM_INCOPY shgemm_oncopy | |||
| #define SHGEMM_ITCOPY shgemm_otcopy | |||
| #else | |||
| #define SHGEMM_INCOPY shgemm_incopy | |||
| #define SHGEMM_ITCOPY shgemm_itcopy | |||
| #endif | |||
| #define SHGEMM_BETA shgemm_beta | |||
| #define SHGEMM_KERNEL shgemm_kernel | |||
| #else | |||
| #define SHDOT_K gotoblas -> shdot_k | |||
| #define SHSTOBF16_K gotoblas -> shstobf16_k | |||
| #define SHDTOBF16_K gotoblas -> shdtobf16_k | |||
| #define SBF16TOS_K gotoblas -> sbf16tos_k | |||
| #define DBF16TOD_K gotoblas -> dbf16tod_k | |||
| #define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy | |||
| #define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy | |||
| #define SHGEMM_INCOPY gotoblas -> shgemm_incopy | |||
| #define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy | |||
| #define SHGEMM_BETA gotoblas -> shgemm_beta | |||
| #define SHGEMM_KERNEL gotoblas -> shgemm_kernel | |||
| #endif | |||
| #define SHGEMM_NN shgemm_nn | |||
| #define SHGEMM_CN shgemm_tn | |||
| #define SHGEMM_TN shgemm_tn | |||
| #define SHGEMM_NC shgemm_nt | |||
| #define SHGEMM_NT shgemm_nt | |||
| #define SHGEMM_CC shgemm_tt | |||
| #define SHGEMM_CT shgemm_tt | |||
| #define SHGEMM_TC shgemm_tt | |||
| #define SHGEMM_TT shgemm_tt | |||
| #define SHGEMM_NR shgemm_nn | |||
| #define SHGEMM_TR shgemm_tn | |||
| #define SHGEMM_CR shgemm_tn | |||
| #define SHGEMM_RN shgemm_nn | |||
| #define SHGEMM_RT shgemm_nt | |||
| #define SHGEMM_RC shgemm_nt | |||
| #define SHGEMM_RR shgemm_nn | |||
| #define SHGEMM_THREAD_NN shgemm_thread_nn | |||
| #define SHGEMM_THREAD_CN shgemm_thread_tn | |||
| #define SHGEMM_THREAD_TN shgemm_thread_tn | |||
| #define SHGEMM_THREAD_NC shgemm_thread_nt | |||
| #define SHGEMM_THREAD_NT shgemm_thread_nt | |||
| #define SHGEMM_THREAD_CC shgemm_thread_tt | |||
| #define SHGEMM_THREAD_CT shgemm_thread_tt | |||
| #define SHGEMM_THREAD_TC shgemm_thread_tt | |||
| #define SHGEMM_THREAD_TT shgemm_thread_tt | |||
| #define SHGEMM_THREAD_NR shgemm_thread_nn | |||
| #define SHGEMM_THREAD_TR shgemm_thread_tn | |||
| #define SHGEMM_THREAD_CR shgemm_thread_tn | |||
| #define SHGEMM_THREAD_RN shgemm_thread_nn | |||
| #define SHGEMM_THREAD_RT shgemm_thread_nt | |||
| #define SHGEMM_THREAD_RC shgemm_thread_nt | |||
| #define SHGEMM_THREAD_RR shgemm_thread_nn | |||
| #endif | |||
| @@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| @@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( | |||
| COMMONOBJS += syrk_thread.$(SUFFIX) | |||
| ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) | |||
| DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) | |||
| @@ -343,16 +343,16 @@ endif | |||
| all :: | |||
| shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| @@ -550,16 +550,16 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h | |||
| beta_thread.$(SUFFIX) : beta_thread.c ../../common.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| @@ -2735,16 +2735,16 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c | |||
| xtrsm_RCLN.$(SUFFIX) : trsm_R.c | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) | |||
| shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| @@ -2943,16 +2943,16 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h | |||
| $(CC) -c $(PFLAGS) $< -o $(@F) | |||
| shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| @@ -227,7 +227,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_HALF | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| @@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_HALF | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| @@ -112,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_HALF | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| @@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; | |||
| BLASLONG gemm_offset_b = GEMM_OFFSET_B; | |||
| #endif | |||
| #if SHGEMM_P == shgemm_p | |||
| BLASLONG shgemm_p = DEFAULT_GEMM_P; | |||
| #if SBGEMM_P == sbgemm_p | |||
| BLASLONG sbgemm_p = DEFAULT_GEMM_P; | |||
| #else | |||
| BLASLONG shgemm_p = SHGEMM_P; | |||
| BLASLONG sbgemm_p = SBGEMM_P; | |||
| #endif | |||
| #if SGEMM_P == sgemm_p | |||
| BLASLONG sgemm_p = DEFAULT_GEMM_P; | |||
| @@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; | |||
| BLASLONG zgemm_p = ZGEMM_P; | |||
| #endif | |||
| #if SHGEMM_Q == shgemm_q | |||
| BLASLONG shgemm_q = DEFAULT_GEMM_Q; | |||
| #if SBGEMM_Q == sbgemm_q | |||
| BLASLONG sbgemm_q = DEFAULT_GEMM_Q; | |||
| #else | |||
| BLASLONG shgemm_q = SHGEMM_Q; | |||
| BLASLONG sbgemm_q = SBGEMM_Q; | |||
| #endif | |||
| #if SGEMM_Q == sgemm_q | |||
| BLASLONG sgemm_q = DEFAULT_GEMM_Q; | |||
| @@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; | |||
| BLASLONG zgemm_q = ZGEMM_Q; | |||
| #endif | |||
| #if SHGEMM_R == shgemm_r | |||
| BLASLONG shgemm_r = DEFAULT_GEMM_R; | |||
| #if SBGEMM_R == sbgemm_r | |||
| BLASLONG sbgemm_r = DEFAULT_GEMM_R; | |||
| #else | |||
| BLASLONG shgemm_r = SHGEMM_R; | |||
| BLASLONG sbgemm_r = SBGEMM_R; | |||
| #endif | |||
| #if SGEMM_R == sgemm_r | |||
| BLASLONG sgemm_r = DEFAULT_GEMM_R; | |||
| @@ -615,7 +615,7 @@ void blas_set_parameter(void){ | |||
| size = BITMASK(cpuid3, 16, 0xff); | |||
| shgemm_p = 192 * (size + 1); | |||
| sbgemm_p = 192 * (size + 1); | |||
| sgemm_p = 192 * (size + 1); | |||
| dgemm_p = 96 * (size + 1); | |||
| cgemm_p = 96 * (size + 1); | |||
| @@ -629,7 +629,7 @@ void blas_set_parameter(void){ | |||
| xgemm_p = 16 * (size + 1); | |||
| #endif | |||
| shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; | |||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED | |||
| BUILD_LAPACK_DEPRECATED = 0 | |||
| endif | |||
| ifndef BUILD_HALF | |||
| BUILD_HALF = 0 | |||
| ifndef BUILD_BFLOAT16 | |||
| BUILD_BFLOAT16 = 0 | |||
| endif | |||
| ifndef BUILD_SINGLE | |||
| BUILD_SINGLE = 0 | |||
| @@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME) | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | |||
| $(LIBPREFIX).def : gensymbol | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| libgoto_hpl.def : gensymbol | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| ifeq ($(OSNAME), Darwin) | |||
| INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||
| @@ -258,23 +258,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -51,7 +51,7 @@ | |||
| zgeadd, dzsum); | |||
| @cblasobjs = (lsame, xerbla); | |||
| @halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); | |||
| @halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); | |||
| @cblasobjsc = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| @@ -94,7 +94,7 @@ | |||
| @cblasobjs = ( cblas_xerbla ); | |||
| @halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| @@ -9,8 +9,8 @@ | |||
| int main(int argc, char **argv) { | |||
| if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { | |||
| printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); | |||
| printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); | |||
| printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); | |||
| printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); | |||
| printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); | |||
| printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); | |||
| printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); | |||
| @@ -46,10 +46,10 @@ SBLAS3OBJS = \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLAS1OBJS = shdot.$(SUFFIX) | |||
| SHBLAS3OBJS = shgemm.$(SUFFIX) | |||
| SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SBBLAS1OBJS = sbdot.$(SUFFIX) | |||
| SBBLAS3OBJS = sbgemm.$(SUFFIX) | |||
| SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) | |||
| endif | |||
| DBLAS1OBJS = \ | |||
| @@ -282,10 +282,10 @@ CSBLAS3OBJS = \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) | |||
| CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) | |||
| CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| # CBLAS wrappers for the bfloat16 routines. The variable names must match the | |||
| # $(CSBBLAS1OBJS), $(CSBBLAS3OBJS) and $(CSBEXTOBJS) references that merge the | |||
| # CBLAS objects into SBBLAS1OBJS, SBBLAS3OBJS and SBEXTOBJS; under any other | |||
| # spelling those references expand to nothing and the objects are never built. | |||
| CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) | |||
| CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) | |||
| CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) | |||
| endif | |||
| CDBLAS1OBJS = \ | |||
| @@ -381,8 +381,8 @@ override CFLAGS += -I. | |||
| SBLAS1OBJS += $(CSBLAS1OBJS) | |||
| SBLAS2OBJS += $(CSBLAS2OBJS) | |||
| SBLAS3OBJS += $(CSBLAS3OBJS) | |||
| SHBLAS1OBJS += $(CSHBLAS1OBJS) | |||
| SHBLAS3OBJS += $(CSHBLAS3OBJS) | |||
| SBBLAS1OBJS += $(CSBBLAS1OBJS) | |||
| SBBLAS3OBJS += $(CSBBLAS3OBJS) | |||
| DBLAS1OBJS += $(CDBLAS1OBJS) | |||
| DBLAS2OBJS += $(CDBLAS2OBJS) | |||
| DBLAS3OBJS += $(CDBLAS3OBJS) | |||
| @@ -393,13 +393,13 @@ ZBLAS1OBJS += $(CZBLAS1OBJS) | |||
| ZBLAS2OBJS += $(CZBLAS2OBJS) | |||
| ZBLAS3OBJS += $(CZBLAS3OBJS) | |||
| SHEXTOBJS += $(CSHEXTOBJS) | |||
| SBEXTOBJS += $(CSBEXTOBJS) | |||
| CBAUXOBJS += $(CXERBLAOBJ) | |||
| endif | |||
| SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) | |||
| SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) | |||
| SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) | |||
| DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) | |||
| QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) | |||
| CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) | |||
| @@ -506,7 +506,7 @@ ifneq ($(BUILD_COMPLEX16),1) | |||
| ZBLASOBJS= | |||
| endif | |||
| FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| $(info FUNCOBJS = {[$(FUNCOBJS)]} ) | |||
| ifdef EXPRECISION | |||
| FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| @@ -772,8 +772,8 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c | |||
| dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c | |||
| $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) | |||
| @@ -1278,8 +1278,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c | |||
| xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -1523,8 +1523,8 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | |||
| cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c | |||
| $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) | |||
| @@ -1857,8 +1857,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c | |||
| cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| # a bit of metaprogramming here to pull out the appropriate KERNEL var | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) | |||
| @@ -149,8 +149,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) | |||
| @@ -208,13 +208,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||
| endif() | |||
| foreach (float_type SINGLE DOUBLE HALF) | |||
| foreach (float_type SINGLE DOUBLE BFLOAT16) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| if (NOT ${BUILD_HALF}) | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| if (NOT ${BUILD_BFLOAT16}) | |||
| continue () | |||
| else () | |||
| set (float_char "SH") | |||
| set (float_char "SB") | |||
| endif () | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| @@ -254,8 +254,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (${float_char}GEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | |||
| @@ -620,8 +620,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.LA | |||
| if(NOT NO_LAPACK) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (NOT DEFINED ${float_char}NEG_TCOPY) | |||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") | |||
| @@ -688,8 +688,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| # a bit of metaprogramming here to pull out the appropriate KERNEL var | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) | |||
| GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) | |||
| @@ -262,9 +262,9 @@ ifndef XDOTKERNEL | |||
| XDOTKERNEL = zdot.S | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| ifndef SHDOTKERNEL | |||
| SHDOTKERNEL = ../x86_64/shdot.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| ifndef SBDOTKERNEL | |||
| SBDOTKERNEL = ../x86_64/sbdot.c | |||
| endif | |||
| ifndef TOBF16KERNEL | |||
| @@ -530,11 +530,11 @@ XBLASOBJS += \ | |||
| xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SHBLASOBJS += \ | |||
| shdot_k$(TSUFFIX).$(SUFFIX) | |||
| sbdot_k$(TSUFFIX).$(SUFFIX) | |||
| SHEXTOBJS += \ | |||
| shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) | |||
| sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX) | |||
| SHEXTOBJS += \ | |||
| sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| @@ -757,12 +757,12 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL | |||
| $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| $(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ | |||
| $(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) | |||
| $(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) | |||
| $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ | |||
| $(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) | |||
| $(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) | |||
| $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ | |||
| $(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) | |||
| $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ | |||
| @@ -80,24 +80,24 @@ SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifndef SHGEMMKERNEL | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| ifndef SBGEMMKERNEL | |||
| SBGEMM_BETA = ../generic/gemm_beta.c | |||
| SBGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SBGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SBGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SBGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SHKERNELOBJS += \ | |||
| shgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ | |||
| $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) | |||
| sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ | |||
| $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) | |||
| endif | |||
| ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" | |||
| @@ -149,7 +149,7 @@ XKERNELOBJS += \ | |||
| $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ | |||
| $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) | |||
| ifeq ($(BUILD_HALF),1) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SHBLASOBJS += $(SHKERNELOBJS) | |||
| endif | |||
| SBLASOBJS += $(SKERNELOBJS) | |||
| @@ -159,8 +159,8 @@ CBLASOBJS += $(CKERNELOBJS) | |||
| ZBLASOBJS += $(ZKERNELOBJS) | |||
| XBLASOBJS += $(XKERNELOBJS) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" | |||
| @@ -492,11 +492,11 @@ ZBLASOBJS += \ | |||
| zgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| endif | |||
| SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -524,9 +524,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| @@ -548,35 +548,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s | |||
| m4 shgemmotcopy.s > shgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ | |||
| rm shgemmotcopy.s shgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s | |||
| m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ | |||
| rm sbgemmotcopy.s sbgemmotcopy_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||
| $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s | |||
| m4 shgemmitcopy.s > shgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ | |||
| rm shgemmitcopy.s shgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s | |||
| m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ | |||
| rm sbgemmitcopy.s sbgemmitcopy_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| @@ -746,16 +746,16 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s | |||
| m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s | |||
| m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| @@ -2375,9 +2375,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) | |||
| @@ -2396,19 +2396,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| $(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||
| $(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| @@ -2518,9 +2518,9 @@ endif | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| @@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * trivial copy of asum.c with the ABS() removed * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include "../simd/intrin.h" | |||
| #include <math.h> | |||
| /* Returns the plain (non-absolute) sum of the elements x[0], x[inc_x], ... | |||
| * below index n * inc_x; returns 0 when n <= 0 or inc_x <= 0. */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG i = 0; | |||
| FLOAT sumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if (n <= 0 || inc_x <= 0) | |||
| return (sumf); | |||
| /* from here on n is the exclusive upper bound on the element index */ | |||
| n *= inc_x; | |||
| while(i < n) | |||
| if (inc_x == 1) | |||
| { | |||
| #if V_SIMD | |||
| /* NOTE(review): this path feeds a FLOAT pointer to the f32 intrinsics; presumably only valid when FLOAT is float - confirm */ | |||
| const int vstep = v_nlanes_f32; | |||
| /* largest multiples of 4 * vstep and of vstep that do not exceed n; kept in BLASLONG so the bound is not narrowed */ | |||
| const BLASLONG unrollx4 = n & (-vstep * 4); | |||
| const BLASLONG unrollx = n & -vstep; | |||
| v_f32 vsum0 = v_zero_f32(); | |||
| v_f32 vsum1 = v_zero_f32(); | |||
| v_f32 vsum2 = v_zero_f32(); | |||
| v_f32 vsum3 = v_zero_f32(); | |||
| while (i < unrollx4) | |||
| { | |||
| /* every load is offset by i so each pass consumes the next 4 * vstep elements */ | |||
| vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); | |||
| vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); | |||
| vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); | |||
| vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); | |||
| i += vstep * 4; | |||
| } | |||
| /* fold the four partial accumulators into vsum0 */ | |||
| vsum0 = v_add_f32( | |||
| v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); | |||
| while (i < unrollx) | |||
| { | |||
| vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); | |||
| i += vstep; | |||
| } | |||
| sumf = v_sum_f32(vsum0); | |||
| #else | |||
| /* scalar fallback: four elements per pass up to the largest multiple of 4 */ | |||
| BLASLONG n1 = n & -4; | |||
| for (; i < n1; i += 4) | |||
| { | |||
| sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3]; | |||
| } | |||
| #endif | |||
| } | |||
| /* remaining elements of the contiguous case, or the whole strided case */ | |||
| while (i < n) | |||
| { | |||
| sumf += x[i]; | |||
| i += inc_x; | |||
| } | |||
| return(sumf); | |||
| return (sumf); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| #include "common.h" | |||
| #if defined(HALF) && defined(HALFCONVERSION) | |||
| #if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) | |||
| static float | |||
| bfloat16tof32 (bfloat16 f16) | |||
| { | |||
| @@ -7,16 +7,16 @@ else | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = shgemm_kernel_power10.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMM_BETA = ../generic/gemm_beta.c | |||
| SBGEMMKERNEL = sbgemm_kernel_power10.c | |||
| SBGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SBGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SBGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = sgemm_kernel_power10.c | |||
| DTRMMKERNEL = dgemm_kernel_power10.c | |||
| @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| #if defined(HALF) && defined(HALFCONVERSION) | |||
| #if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) | |||
| static float | |||
| bfloat16tof32 (bfloat16 f16) | |||
| { | |||
| @@ -131,7 +131,7 @@ vector char mask = | |||
| #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); | |||
| /************************************************************************************* | |||
| * SHGEMM Kernel | |||
| * SBGEMM Kernel | |||
| *************************************************************************************/ | |||
| int | |||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| @@ -53,32 +53,32 @@ gotoblas_t TABLE_NAME = { | |||
| GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, | |||
| #ifdef BUILD_HALF | |||
| #ifdef BUILD_BFLOAT16 | |||
| 0, 0, 0, | |||
| SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, | |||
| #ifdef SHGEMM_DEFAULT_UNROLL_MN | |||
| SHGEMM_DEFAULT_UNROLL_MN, | |||
| SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N, | |||
| #ifdef SBGEMM_DEFAULT_UNROLL_MN | |||
| SBGEMM_DEFAULT_UNROLL_MN, | |||
| #else | |||
| MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), | |||
| MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), | |||
| #endif | |||
| shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | |||
| sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | |||
| samax_kTS, samin_kTS, smax_kTS, smin_kTS, | |||
| isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | |||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, | |||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, | |||
| dsdot_kTS, | |||
| srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||
| sgemv_nTS, sgemv_tTS, sger_kTS, | |||
| ssymv_LTS, ssymv_UTS, | |||
| shgemm_kernelTS, shgemm_betaTS, | |||
| #if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N | |||
| shgemm_incopyTS, shgemm_itcopyTS, | |||
| sbgemm_kernelTS, sbgemm_betaTS, | |||
| #if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N | |||
| sbgemm_incopyTS, sbgemm_itcopyTS, | |||
| #else | |||
| shgemm_oncopyTS, shgemm_otcopyTS, | |||
| sbgemm_oncopyTS, sbgemm_otcopyTS, | |||
| #endif | |||
| shgemm_oncopyTS, shgemm_otcopyTS, | |||
| sbgemm_oncopyTS, sbgemm_otcopyTS, | |||
| strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, | |||
| #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N | |||
| @@ -830,8 +830,8 @@ gotoblas_t TABLE_NAME = { | |||
| #if (ARCH_ARM64) | |||
| static void init_parameter(void) { | |||
| #if (BUILD_HALF) | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #if (BUILD_BFLOAT16) | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| @@ -846,8 +846,8 @@ static void init_parameter(void) { | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #endif | |||
| #if (BUILD_HALF) | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #if (BUILD_BFLOAT16) | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| #if BUILD_SINGLE == 1 | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| @@ -862,8 +862,8 @@ static void init_parameter(void) { | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| #endif | |||
| #if (BUILD_HALF) | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #if (BUILD_BFLOAT16) | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| #if BUILD_SINGLE == 1 | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| @@ -936,16 +936,16 @@ static void init_parameter(void) { | |||
| #if (ARCH_POWER) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| @@ -953,8 +953,8 @@ static void init_parameter(void) { | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| @@ -965,16 +965,16 @@ static void init_parameter(void) { | |||
| #if (ARCH_ZARCH) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| @@ -982,8 +982,8 @@ static void init_parameter(void) { | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| @@ -1124,10 +1124,10 @@ static void init_parameter(void) { | |||
| (void) l2; /* dirty trick to suppress unused variable warning for targets */ | |||
| /* where the GEMM unrolling parameters do not depend on l2 */ | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| @@ -146,8 +146,8 @@ ifndef XDOTKERNEL | |||
| XDOTKERNEL = zdot.S | |||
| endif | |||
| ifndef SHDOTKERNEL | |||
| SHDOTKERNEL = shdot.c | |||
| ifndef SBDOTKERNEL | |||
| SBDOTKERNEL = sbdot.c | |||
| endif | |||
| ifndef TOBF16KERNEL | |||
| @@ -54,6 +54,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $BASE_SHIFT, INCX | |||
| @@ -49,6 +49,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $BASE_SHIFT, INCX | |||
| salq $BASE_SHIFT, INCY | |||
| @@ -60,6 +60,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $BASE_SHIFT, INCX | |||
| fldz | |||
| @@ -60,6 +60,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $ZBASE_SHIFT, INCX | |||
| fldz | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| jle .L999 | |||
| @@ -42,6 +42,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| FLD 1 * SIZE(ARG1) | |||
| fsubrp %st, %st(1) | |||
| @@ -58,6 +58,10 @@ | |||
| PROLOGUE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| pushl %edi | |||
| pushl %esi | |||
| pushl %ebx | |||
| @@ -74,6 +74,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -76,6 +76,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -75,6 +75,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -74,6 +74,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -74,6 +74,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -74,6 +74,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -28,16 +28,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(COOPERLAKE) | |||
| #include "shdot_microk_cooperlake.c" | |||
| #include "sbdot_microk_cooperlake.c" | |||
| #endif | |||
| static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) | |||
| static float sbdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) | |||
| { | |||
| float d = 0.0; | |||
| #ifdef HAVE_SHDOT_ACCL_KERNEL | |||
| #ifdef HAVE_SBDOT_ACCL_KERNEL | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| return shdot_accl_kernel(n, x, y); | |||
| return sbdot_accl_kernel(n, x, y); | |||
| } | |||
| #endif | |||
| @@ -56,11 +56,11 @@ static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, | |||
| } | |||
| #if defined(SMP) | |||
| static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, | |||
| static int sbdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, | |||
| bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, | |||
| float *result, BLASLONG dummy3) | |||
| { | |||
| *(float *)result = shdot_compute(n, x, inc_x, y, inc_y); | |||
| *(float *)result = sbdot_compute(n, x, inc_x, y, inc_y); | |||
| return 0; | |||
| } | |||
| @@ -94,13 +94,13 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y | |||
| } | |||
| if (nthreads <= 1) { | |||
| dot_result = shdot_compute(n, x, inc_x, y, inc_y); | |||
| dot_result = sbdot_compute(n, x, inc_x, y, inc_y); | |||
| } else { | |||
| char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2]; | |||
| int mode = BLAS_BFLOAT16 | BLAS_REAL; | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, thread_result, 0, | |||
| (void *)shdot_thread_func, nthreads); | |||
| (void *)sbdot_thread_func, nthreads); | |||
| float * ptr = (float *)thread_result; | |||
| for (int i = 0; i < nthreads; i++) { | |||
| dot_result += (*ptr); | |||
| @@ -108,7 +108,7 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y | |||
| } | |||
| } | |||
| #else | |||
| dot_result = shdot_compute(n, x, inc_x, y, inc_y); | |||
| dot_result = sbdot_compute(n, x, inc_x, y, inc_y); | |||
| #endif | |||
| return dot_result; | |||
| @@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_SHDOT_ACCL_KERNEL 1 | |||
| #define HAVE_SBDOT_ACCL_KERNEL 1 | |||
| #include "common.h" | |||
| #include <immintrin.h> | |||
| static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) | |||
| static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) | |||
| { | |||
| __m128 accum128 = _mm_setzero_ps(); | |||
| if (n> 127) { /* n range from 128 to inf. */ | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| jle .L999 | |||
| @@ -59,6 +59,11 @@ | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| #define N %ebx | |||
| #define X %esi | |||
| #define INCX %ecx | |||
| @@ -78,6 +78,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -97,6 +97,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -76,6 +76,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -75,6 +75,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -90,6 +90,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| subq $STACKSIZE, %rsp | |||
| movq %rbx, 0(%rsp) | |||
| movq %rbp, 8(%rsp) | |||
| @@ -55,6 +55,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $ZBASE_SHIFT, INCX | |||
| fldz | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| jle .L999 | |||
| @@ -55,6 +55,8 @@ | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| movq 40(%rsp), INCY | |||
| #endif | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| jle .L999 | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| salq $ZBASE_SHIFT, INCX | |||
| FLD 8(%rsp) | |||
| @@ -50,6 +50,10 @@ | |||
| PROLOGUE | |||
| PROFCODE | |||
| #ifdef WINDOWS_ABI | |||
| emms | |||
| #endif | |||
| fldz | |||
| testq M, M | |||
| jle .L999 | |||
| @@ -382,7 +382,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ | |||
| mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; | |||
| #elif defined(HALF) | |||
| mode = BLAS_HALF | BLAS_REAL; | |||
| mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; | |||
| mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1; | |||
| #else | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; | |||
| @@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef PARAM_H | |||
| #define PARAM_H | |||
| #define SHGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SHGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SHGEMM_DEFAULT_UNROLL_MN 32 | |||
| #define SHGEMM_DEFAULT_P 256 | |||
| #define SHGEMM_DEFAULT_R 256 | |||
| #define SHGEMM_DEFAULT_Q 256 | |||
| #define SBGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SBGEMM_DEFAULT_UNROLL_M 8 | |||
| #define SBGEMM_DEFAULT_UNROLL_MN 32 | |||
| #define SBGEMM_DEFAULT_P 256 | |||
| #define SBGEMM_DEFAULT_R 256 | |||
| #define SBGEMM_DEFAULT_Q 256 | |||
| #ifdef OPTERON | |||
| #define SNUMOPT 4 | |||
| @@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER10) | |||
| #undef SHGEMM_DEFAULT_UNROLL_N | |||
| #undef SHGEMM_DEFAULT_UNROLL_M | |||
| #undef SHGEMM_DEFAULT_P | |||
| #undef SHGEMM_DEFAULT_R | |||
| #undef SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SHGEMM_DEFAULT_UNROLL_N 8 | |||
| #define SHGEMM_DEFAULT_P 832 | |||
| #define SHGEMM_DEFAULT_Q 1026 | |||
| #define SHGEMM_DEFAULT_R 4096 | |||
| #undef SBGEMM_DEFAULT_UNROLL_N | |||
| #undef SBGEMM_DEFAULT_UNROLL_M | |||
| #undef SBGEMM_DEFAULT_P | |||
| #undef SBGEMM_DEFAULT_R | |||
| #undef SBGEMM_DEFAULT_Q | |||
| #define SBGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SBGEMM_DEFAULT_UNROLL_N 8 | |||
| #define SBGEMM_DEFAULT_P 832 | |||
| #define SBGEMM_DEFAULT_Q 1026 | |||
| #define SBGEMM_DEFAULT_R 4096 | |||
| #endif | |||
| #if defined(SPARC) && defined(V7) | |||
| @@ -214,16 +214,16 @@ endif | |||
| #ifeq ($(BUILD_HALF),1) | |||
| #level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 | |||
| #ifeq ($(BUILD_BFLOAT16),1) | |||
| #level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 | |||
| #else | |||
| #level3 : sblat3 dblat3 cblat3 zblat3 | |||
| #endif | |||
| ifndef CROSS | |||
| rm -f ?BLAT3.SUMM | |||
| ifeq ($(BUILD_HALF),1) | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| @@ -245,8 +245,8 @@ endif | |||
| ifdef SMP | |||
| rm -f ?BLAT3.SUMM | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(BUILD_HALF),1) | |||
| OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| @@ -266,8 +266,8 @@ ifeq ($(BUILD_COMPLEX16),1) | |||
| @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 | |||
| endif | |||
| else | |||
| ifeq ($(BUILD_HALF),1) | |||
| OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| @@ -377,9 +377,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX),1) | |||
| @@ -398,7 +398,7 @@ clean: | |||
| @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ | |||
| sblat1 dblat1 cblat1 zblat1 \ | |||
| sblat2 dblat2 cblat2 zblat2 \ | |||
| test_shgemm sblat3 dblat3 cblat3 zblat3 \ | |||
| test_sbgemm sblat3 dblat3 cblat3 zblat3 \ | |||
| sblat1p dblat1p cblat1p zblat1p \ | |||
| sblat2p dblat2p cblat2p zblat2p \ | |||
| sblat3p dblat3p cblat3p zblat3p \ | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <stdint.h> | |||
| #include "../common.h" | |||
| #define SGEMM BLASFUNC(sgemm) | |||
| #define SHGEMM BLASFUNC(shgemm) | |||
| #define SBGEMM BLASFUNC(sbgemm) | |||
| typedef union | |||
| { | |||
| unsigned short v; | |||
| @@ -102,7 +102,7 @@ main (int argc, char *argv[]) | |||
| } | |||
| SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | |||
| &m, B, &k, &beta, C, &m); | |||
| SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | |||
| SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | |||
| &m, BB, &k, &beta, CC, &m); | |||
| for (i = 0; i < n; i++) | |||
| for (j = 0; j < m; j++) | |||
| @@ -126,6 +126,6 @@ main (int argc, char *argv[]) | |||
| } | |||
| } | |||
| if (ret != 0) | |||
| fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | |||
| fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); | |||
| return ret; | |||
| } | |||