| @@ -70,6 +70,7 @@ test/SBLAT2.SUMM | |||
| test/SBLAT3.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| @@ -79,6 +80,7 @@ test/dblat3 | |||
| test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/test_shgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| @@ -86,10 +86,13 @@ if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| if (NOT DEFINED BUILD_HALF) | |||
| set (BUILD_HALF false) | |||
| endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| set(BUILD_HALF true) | |||
| # set(BUILD_HALF true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -121,7 +124,7 @@ if (BUILD_COMPLEX16) | |||
| list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE | |||
| endif () | |||
| if (BUILD_SINGLE OR BUILD_HALF) | |||
| if (BUILD_HALF) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "HALF") # defines nothing | |||
| endif () | |||
| @@ -273,6 +273,9 @@ COMMON_PROF = -pg | |||
| # | |||
| # CPP_THREAD_SAFETY_TEST = 1 | |||
| # If you want to enable the experimental BFLOAT16 support | |||
| # BUILD_HALF = 1 | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -1124,6 +1124,10 @@ ifeq ($(USE_TLS), 1) | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| CCOMMON_OPT += -DBUILD_HALF | |||
| endif | |||
| CCOMMON_OPT += -DVERSION=\"$(VERSION)\" | |||
| ifndef SYMBOLPREFIX | |||
| @@ -1395,6 +1399,7 @@ export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export BUILD_HALF | |||
| export SHGEMM_UNROLL_M | |||
| export SHGEMM_UNROLL_N | |||
| @@ -113,6 +113,7 @@ macro(SetDefaultL1) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| if (BUILD_HALF) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| @@ -131,6 +132,7 @@ macro(SetDefaultL1) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| set(SHSUMKERNEL ../arm/sum.c) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| @@ -179,10 +181,11 @@ macro(SetDefaultL2) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SHGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL3) | |||
| @@ -190,6 +193,7 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SHGEMM_BETA ../generic/gemm_beta.c) | |||
| @@ -201,6 +205,6 @@ macro(SetDefaultL3) | |||
| set(SHGEMMITCOPYOBJ shgemm_itcopy.o) | |||
| set(SHGEMMONCOPYOBJ shgemm_oncopy.o) | |||
| set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -47,7 +47,7 @@ typedef struct { | |||
| int dtb_entries; | |||
| int offsetA, offsetB, align; | |||
| #if 1 | |||
| #ifdef BUILD_HALF | |||
| int shgemm_p, shgemm_q, shgemm_r; | |||
| int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; | |||
| @@ -1002,12 +1002,14 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 gotoblas -> exclusive_cache | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P gotoblas -> shgemm_p | |||
| #define SHGEMM_Q gotoblas -> shgemm_q | |||
| #define SHGEMM_R gotoblas -> shgemm_r | |||
| #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m | |||
| #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n | |||
| #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn | |||
| #endif | |||
| #define SGEMM_P gotoblas -> sgemm_p | |||
| #define SGEMM_Q gotoblas -> sgemm_q | |||
| @@ -1086,6 +1088,7 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 0 | |||
| #endif | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P SHGEMM_DEFAULT_P | |||
| #define SHGEMM_Q SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_R SHGEMM_DEFAULT_R | |||
| @@ -1096,6 +1099,7 @@ extern gotoblas_t *gotoblas; | |||
| #else | |||
| #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) | |||
| #endif | |||
| #endif | |||
| #define SGEMM_P SGEMM_DEFAULT_P | |||
| #define SGEMM_Q SGEMM_DEFAULT_Q | |||
| @@ -1330,31 +1334,31 @@ extern gotoblas_t *gotoblas; | |||
| #endif | |||
| #ifndef SHGEMM_DEFAULT_R | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SGEMM_DEFAULT_R | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef DGEMM_DEFAULT_R | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef QGEMM_DEFAULT_R | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef CGEMM_DEFAULT_R | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef ZGEMM_DEFAULT_R | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef XGEMM_DEFAULT_R | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SNUMOPT | |||
| @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ | |||
| strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ | |||
| @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( | |||
| COMMONOBJS += syrk_thread.$(SUFFIX) | |||
| ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) | |||
| DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) | |||
| QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) | |||
| @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| int gotoblas_set_affinity(int); | |||
| int gotoblas_set_affinity2(int); | |||
| int get_node(void); | |||
| @@ -281,6 +281,8 @@ int get_node(void); | |||
| static int increased_threads = 0; | |||
| #ifdef OS_LINUX | |||
| extern int openblas_get_num_threads(void); | |||
| int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
| const int active_threads = openblas_get_num_threads(); | |||
| @@ -602,7 +604,7 @@ int blas_thread_init(void){ | |||
| if(ret!=0){ | |||
| struct rlimit rlim; | |||
| const char *msg = strerror(ret); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); | |||
| #ifdef RLIMIT_NPROC | |||
| if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " | |||
| @@ -2070,7 +2070,7 @@ if (!release->address) return; | |||
| if (munmap(release -> address, BUFFER_SIZE)) { | |||
| int errsv=errno; | |||
| perror("OpenBLAS : munmap failed:"); | |||
| printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); | |||
| printf("error code=%d,\trelease->address=%p\n",errsv,release->address); | |||
| } | |||
| } | |||
| @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED | |||
| BUILD_LAPACK_DEPRECATED = 0 | |||
| endif | |||
| ifndef BUILD_HALF | |||
| BUILD_HALF = 0 | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifndef ONLY_CBLAS | |||
| @@ -234,23 +238,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -30,7 +30,7 @@ | |||
| icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, | |||
| izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, | |||
| scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, | |||
| shgemm, smax,smin,snrm2, | |||
| smax,smin,snrm2, | |||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
| strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, | |||
| @@ -51,6 +51,7 @@ | |||
| zimatcopy, | |||
| ); | |||
| @halfblasobjs = (shgemm); | |||
| @cblasobjs = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| @@ -67,7 +68,7 @@ | |||
| cblas_isamax, cblas_izamax, | |||
| cblas_sasum, cblas_saxpy, | |||
| cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, | |||
| cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, | |||
| cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, | |||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
| cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | |||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
| @@ -83,6 +84,8 @@ | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd | |||
| ); | |||
| @halfcblasobjs = (cblas_shgemm); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| qgemv,qger,qmax,qmin, | |||
| @@ -3454,6 +3457,10 @@ use File::Spec; | |||
| use File::Basename; | |||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | |||
| if ($ARGV[12] == 1) { | |||
| @blasobjs = (@blasobjs, @halfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @halfcblasobjs); | |||
| } | |||
| if ($ARGV[8] == 1) { | |||
| #ONLY_CBLAS=1 | |||
| @underscore_objs = (@misc_underscore_objs); | |||
| @@ -46,7 +46,9 @@ SBLAS3OBJS = \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLAS3OBJS = shgemm.$(SUFFIX) | |||
| endif | |||
| DBLAS1OBJS = \ | |||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
| @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) | |||
| endif | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| @@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c | |||
| xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c | |||
| cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -137,7 +137,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type SINGLE DOUBLE HALF) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (NOT ${BUILD_HALF}) | |||
| continue () | |||
| else () | |||
| set (float_char "SH") | |||
| endif () | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| endforeach() | |||
| @@ -59,7 +59,8 @@ ifeq ($(CORE), Z14) | |||
| USE_TRMM = 1 | |||
| endif | |||
| #ifndef SHGEMMKERNEL | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifndef SHGEMMKERNEL | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| @@ -70,12 +71,13 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #endif | |||
| endif | |||
| SHKERNELOBJS += \ | |||
| shgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ | |||
| $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) | |||
| endif | |||
| SKERNELOBJS += \ | |||
| sgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -110,7 +112,9 @@ XKERNELOBJS += \ | |||
| $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ | |||
| $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += $(SHKERNELOBJS) | |||
| endif | |||
| SBLASOBJS += $(SKERNELOBJS) | |||
| DBLASOBJS += $(DKERNELOBJS) | |||
| QBLASOBJS += $(QKERNELOBJS) | |||
| @@ -118,7 +122,10 @@ CBLASOBJS += $(CKERNELOBJS) | |||
| ZBLASOBJS += $(ZKERNELOBJS) | |||
| XBLASOBJS += $(XKERNELOBJS) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_beta$(TSUFFIX).$(SUFFIX) \ | |||
| strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -408,11 +415,13 @@ ZBLASOBJS += \ | |||
| zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ | |||
| zgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_HALF), 1) | |||
| SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| endif | |||
| SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -438,8 +447,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -459,10 +470,14 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s | |||
| m4 shgemmotcopy.s > shgemmotcopy_nomacros.s | |||
| @@ -487,6 +502,7 @@ else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| endif | |||
| $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| @@ -646,6 +662,8 @@ else | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s | |||
| @@ -655,6 +673,7 @@ ifeq ($(OS), AIX) | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| @@ -2272,8 +2291,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2290,6 +2311,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2304,6 +2327,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2408,8 +2433,11 @@ endif | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -53,6 +53,7 @@ gotoblas_t TABLE_NAME = { | |||
| GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, | |||
| #ifdef BUILD_HALF | |||
| 0, 0, 0, | |||
| SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, | |||
| #ifdef SHGEMM_DEFAULT_UNROLL_MN | |||
| @@ -109,7 +110,7 @@ gotoblas_t TABLE_NAME = { | |||
| #else | |||
| NULL,NULL, | |||
| #endif | |||
| #endif | |||
| 0, 0, 0, | |||
| SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, | |||
| @@ -706,19 +707,25 @@ gotoblas_t TABLE_NAME = { | |||
| #if defined(ARCH_ARM64) | |||
| static void init_parameter(void) { | |||
| #if defined(BUILD_HALF) | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #if defined(BUILD_HALF) | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| #if defined(BUILD_HALF) | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| @@ -782,20 +789,26 @@ static void init_parameter(void) { | |||
| #if defined(ARCH_POWER) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| @@ -805,20 +818,26 @@ static void init_parameter(void) { | |||
| #if defined(ARCH_ZARCH) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| @@ -958,9 +977,11 @@ static void init_parameter(void) { | |||
| (void) l2; /* dirty trick to suppress unused variable warning for targets */ | |||
| /* where the GEMM unrolling parameters do not depend on l2 */ | |||
| #ifdef BUILD_HALF | |||
| TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; | |||
| TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; | |||
| TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| @@ -64,9 +64,17 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 | |||
| else | |||
| level3 : sblat3 dblat3 cblat3 zblat3 | |||
| endif | |||
| ifndef CROSS | |||
| rm -f ?BLAT3.SUMM | |||
| ifeq ($(BUILD_HALF),1) | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat | |||
| @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat | |||
| @@ -78,6 +86,10 @@ ifndef CROSS | |||
| ifdef SMP | |||
| rm -f ?BLAT3.SUMM | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(BUILD_HALF),1) | |||
| OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat | |||
| @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 | |||
| OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat | |||
| @@ -87,6 +99,10 @@ ifeq ($(USE_OPENMP), 1) | |||
| OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat | |||
| @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 | |||
| else | |||
| ifeq ($(BUILD_HALF),1) | |||
| OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM | |||
| @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 | |||
| endif | |||
| OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat | |||
| @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 | |||
| OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat | |||
| @@ -165,6 +181,11 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) | |||
| sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| ifeq ($(BUILD_HALF),1) | |||
| test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| endif | |||
| dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) | |||
| $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) | |||
| @@ -187,7 +208,7 @@ clean: | |||
| @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ | |||
| sblat1 dblat1 cblat1 zblat1 \ | |||
| sblat2 dblat2 cblat2 zblat2 \ | |||
| sblat3 dblat3 cblat3 zblat3 \ | |||
| test_shgemm sblat3 dblat3 cblat3 zblat3 \ | |||
| sblat1p dblat1p cblat1p zblat1p \ | |||
| sblat2p dblat2p cblat2p zblat2p \ | |||
| sblat3p dblat3p cblat3p zblat3p \ | |||
| @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdint.h> | |||
| #include "common.h" | |||
| #include "../common.h" | |||
| #define SGEMM BLASFUNC(sgemm) | |||
| #define SHGEMM BLASFUNC(shgemm) | |||
| typedef union | |||
| @@ -52,7 +52,7 @@ main (int argc, char *argv[]) | |||
| int m, n, k; | |||
| int i, j, l; | |||
| int ret = 0; | |||
| int loop = 20; | |||
| int loop = 100; | |||
| char transA = 'N', transB = 'N'; | |||
| float alpha = 1.0, beta = 0.0; | |||
| char transa = 'N'; | |||
| @@ -71,8 +71,8 @@ main (int argc, char *argv[]) | |||
| { | |||
| for (int i = 0; i < m; i++) | |||
| { | |||
| A[j * k + i] = j * 9.0; | |||
| B[j * k + i] = i * 2.0; | |||
| A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||
| B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||
| C[j * k + i] = 0; | |||
| AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; | |||
| BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; | |||
| @@ -85,11 +85,12 @@ main (int argc, char *argv[]) | |||
| &m, BB, &k, &beta, CC, &m); | |||
| for (i = 0; i < n; i++) | |||
| for (j = 0; j < m; j++) | |||
| for (l = 0; l < k; l++) | |||
| if (CC[i * m + j] != C[i * m + j]) | |||
| ret++; | |||
| for (j = 0; j < m; j++) | |||
| for (l = 0; l < k; l++) | |||
| if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) | |||
| ret++; | |||
| } | |||
| fprintf (stderr, "Return code: %d\n", ret); | |||
| if (ret != 0) | |||
| fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | |||
| return ret; | |||
| } | |||