Add dgemm kernel for arm64 SVE (tags/v0.3.19)
| @@ -197,3 +197,7 @@ In chronological order: | |||
| * River Dillon <oss@outerpassage.net> | |||
| * [2021-07-10] fix compilation with musl libc | |||
| * Bine Brank <https://github.com/binebrank> | |||
| * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | |||
| * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM | |||
| @@ -20,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| endif | |||
| # ARMV8SVE core: enable the SVE extension for C, and for Fortran unless the Fortran compiler is NAG. | |||
| ifeq ($(CORE), ARMV8SVE) | |||
| CCOMMON_OPT += -march=armv8-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a+sve | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -144,6 +144,24 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS) | |||
| endif () | |||
| endif () | |||
| # A64FX: always request armv8.2-a+sve; add -mtune=a64fx only when the C compiler reports version 11.0 or newer. | |||
| if (${CORE} STREQUAL A64FX) | |||
| if (NOT DYNAMIC_ARCH) | |||
| # The compiler output is stripped and the expansions below are quoted so that a trailing | |||
| # newline or an empty result (failed query) cannot turn the if() into a configure error. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # VERSION_GREATER OR VERSION_EQUAL is kept instead of VERSION_GREATER_EQUAL for older CMake releases. | |||
| if ("${GCC_VERSION}" VERSION_GREATER 11.0 OR "${GCC_VERSION}" VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| # ARMV8SVE core: request the SVE extension for C sources unless this is a DYNAMIC_ARCH build. | |||
| if (${CORE} STREQUAL ARMV8SVE AND NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (HAVE_AVX2) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") | |||
| @@ -1198,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| /* Forced target ARMV8SVE: selected when FORCE_ARMV8SVE is defined. */ | |||
| /* ARCHCONFIG carries fixed L1/L2/DTB parameters and the feature defines, including -DHAVE_SVE. */ | |||
| #ifdef FORCE_ARMV8SVE | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV8SVE" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DARMV8SVE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
| #define LIBNAME "armv8sve" | |||
| #define CORENAME "ARMV8SVE" | |||
| #endif | |||
| #ifdef FORCE_ARMV8 | |||
| #define FORCE | |||
| @@ -1436,7 +1450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
| #define LIBNAME "a64fx" | |||
| #define CORENAME "A64FX" | |||
| #else | |||
| @@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) | |||
| # symm for s and d | |||
| # Choose the inner (UNROLL_M) SYMM copy sources. Each routine is resolved on its own, | |||
| # as the Makefile build does with separate ifdef checks per variable: a kernel list that | |||
| # overrides only one of the two must not turn the other into a bare "${KERNELDIR}/" path. | |||
| if (DEFINED ${float_char}SYMMUCOPY_M) | |||
| set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") | |||
| else () | |||
| set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| if (DEFINED ${float_char}SYMMLCOPY_M) | |||
| set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") | |||
| else () | |||
| set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. | |||
| # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| # Choose the inner (UNROLL_M) TRMM copy sources. Every routine is checked separately, | |||
| # as the Makefile build does with one ifdef per variable, so overriding a subset of the | |||
| # four copies leaves the remaining ones on their generic implementation instead of | |||
| # producing a bare "${KERNELDIR}/" path. | |||
| if (DEFINED ${float_char}TRMMUNCOPY_M) | |||
| set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") | |||
| else () | |||
| set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| if (DEFINED ${float_char}TRMMLNCOPY_M) | |||
| set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") | |||
| else () | |||
| set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| if (DEFINED ${float_char}TRMMUTCOPY_M) | |||
| set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") | |||
| else () | |||
| set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| if (DEFINED ${float_char}TRMMLTCOPY_M) | |||
| set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") | |||
| else () | |||
| set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | |||
| @@ -1531,29 +1531,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N | |||
| $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| # Inner (UNROLL_M) DTRMM copy objects. For each of the four routines (un, ln, ut, lt) the unit and | |||
| # non-unit objects are built from $(KERNELDIR)/$(DTRMM??COPY_M) when that variable is defined, | |||
| # otherwise from the generic source selected by DGEMM_UNROLL_M. The compile flags are the same in both branches. | |||
| ifdef DTRMMUNCOPY_M | |||
| $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMLNCOPY_M | |||
| $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMUTCOPY_M | |||
| $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef DTRMMLTCOPY_M | |||
| $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1789,11 +1821,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). | |||
| $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| # Inner (UNROLL_M) DSYMM copy objects. The upper and lower copies are selected independently: | |||
| # $(KERNELDIR)/$(DSYMMUCOPY_M) or $(KERNELDIR)/$(DSYMMLCOPY_M) when defined, otherwise the generic source for DGEMM_UNROLL_M. | |||
| ifdef DSYMMUCOPY_M | |||
| $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef DSYMMLCOPY_M | |||
| $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -143,34 +143,28 @@ endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| @@ -0,0 +1,191 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # Double-precision GEMM/TRMM: the compute kernels and the inner (I*) copy routines use the *_sve_* sources, | |||
| # while the outer (O*) copies use the generic C versions selected by DGEMM_UNROLL_N. | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # Overrides for the inner DTRMM and DSYMM copy routines (file names relative to the kernel directory). | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* X0 X1 X2 d0 X3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define lanes x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define alpha0 d10 | |||
| #define alphaZ z2.d | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 lanes | |||
| // 16 pA | |||
| // 17 alpha | |||
| // 18 must save | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
| //v00 ALPHA -> pA0_0 | |||
| //v01 pA0_1 | |||
| //v02 ALPHA0 | |||
| //v03 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| //v07 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB0_4 | |||
| //v13 must save pB0_5 | |||
| //v14 must save pB0_6 | |||
| //v15 must save pB0_7 | |||
| //v16 must save C0 | |||
| //v17 must save C1 | |||
| //v18 must save C2 | |||
| //v19 must save C3 | |||
| //v20 must save C4 | |||
| //v21 must save C5 | |||
| //v22 must save C6 | |||
| //v23 must save C7 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| // INITv1x8: clear the eight accumulator vectors z16-z23 (C0-C7 in the register map above). | |||
| .macro INITv1x8 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| dup z20.d, #0 | |||
| dup z21.d, #0 | |||
| dup z22.d, #0 | |||
| dup z23.d, #0 | |||
| .endm | |||
| // KERNELv1x8_I: opening step of the pipelined v1x8 sequence. | |||
| // Loads two consecutive A vectors (z0 for this step, z1 for the following one) and advances pA past both. | |||
| // Loads eight B doubles, each replicated across a vector (z8-z15), and accumulates z16-z23 += z0 * z8..z15 under p1. | |||
| // Between the multiplies z8-z15 are refilled with the next eight B doubles; pB advances by 64 bytes twice. | |||
| // NOTE(review): p0 and p1 are set up outside this macro - presumably p0 is all-true and p1 masks the active lanes; confirm. | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA] | |||
| ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| // KERNELv1x8_M1: middle step that consumes z0. | |||
| // Loads the next A vector into z1, accumulates z16-z23 += z0 * z8..z15 under p1, | |||
| // and refills z8-z15 with the next eight replicated B doubles (pB += 64). Prefetches ahead in A. | |||
| .macro KERNELv1x8_M1 | |||
| ld1d z1.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| // KERNELv1x8_M2: middle step that consumes z1 (counterpart of KERNELv1x8_M1 with z0/z1 swapped). | |||
| // Loads the next A vector into z0, accumulates z16-z23 += z1 * z8..z15 under p1, | |||
| // and refills z8-z15 with the next eight replicated B doubles (pB += 64). Prefetches ahead in B. | |||
| .macro KERNELv1x8_M2 | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z1.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| fmla z17.d, p1/m, z1.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| fmla z18.d, p1/m, z1.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| fmla z19.d, p1/m, z1.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z1.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| .endm | |||
| // KERNELv1x8_E: closing step of the pipelined sequence. | |||
| // Accumulates z16-z23 += z1 * z8..z15 under p1 using values already in registers; | |||
| // performs no loads and leaves pA and pB unchanged (one prefetch ahead in B only). | |||
| .macro KERNELv1x8_E | |||
| fmla z16.d, p1/m, z1.d, z8.d | |||
| fmla z17.d, p1/m, z1.d, z9.d | |||
| fmla z18.d, p1/m, z1.d, z10.d | |||
| fmla z19.d, p1/m, z1.d, z11.d | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| fmla z23.d, p1/m, z1.d, z15.d | |||
| .endm | |||
| // KERNELv1x8_SUB: self-contained single step (no register state carried in or out besides the accumulators). | |||
| // Loads one A vector (z0) and eight replicated B doubles (z8-z15), advances pA by lanes*8 and pB by 64 bytes, | |||
| // then accumulates z16-z23 += z0 * z8..z15 under p1. | |||
| .macro KERNELv1x8_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| .endm | |||
| // SAVEv1x8: write the eight accumulators back to C. | |||
| // For each of eight C vectors spaced LDC bytes apart, starting at pCRow0: load under p1, | |||
| // add accumulator * alphaZ (z24+i += z16+i * alphaZ), and store back under p1. | |||
| // pCRow1 and pCRow2 alternate as the running pointer; z24-z31 are used as temporaries. | |||
| // On exit pCRow0 has advanced by lanes*8 bytes. | |||
| .macro SAVEv1x8 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z28.d, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaZ | |||
| st1d z28.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z29.d, p1/z, [pCRow1] | |||
| fmla z29.d, p1/m, z21.d, alphaZ | |||
| st1d z29.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z30.d, p1/z, [pCRow2] | |||
| fmla z30.d, p1/m, z22.d, alphaZ | |||
| st1d z30.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z31.d, p1/z, [pCRow1] | |||
| fmla z31.d, p1/m, z23.d, alphaZ | |||
| st1d z31.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
/* INITv1x4: zero the four accumulators (z16-z19) used by the v1x4 kernel. */
| .macro INITv1x4 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| .endm | |||
/*
 * KERNELv1x4_SUB: one accumulation step of the (lanes x 4) tile.
 * Loads one vector from pA under p1 into z0, broadcasts four consecutive
 * doubles from pB into z8..z11 and accumulates z0 * z8..z11 into z16..z19.
 * pA advances by lanes * 8 bytes, pB by 32 bytes.
 */
| .macro KERNELv1x4_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] /* load one double and replicate it to every lane */ | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| add pB, pB, 32 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| .endm | |||
/*
 * SAVEv1x4: write back a (lanes x 4) tile of results.
 * Four C pointers are visited, starting at pCRow0 and each LDC further on.
 * For every pointer the existing values are loaded under p1, the matching
 * accumulator (z16..z19) times alphaZ is added, and the result is stored
 * back. On exit pCRow0 has been advanced by lanes * 8 bytes.
 */
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
/* INITv1x2: zero the two accumulators (z16, z17) used by the v1x2 kernel. */
| .macro INITv1x2 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| .endm | |||
/*
 * KERNELv1x2_SUB: one accumulation step of the (lanes x 2) tile.
 * Loads one vector from pA under p1 into z0, broadcasts two consecutive
 * doubles from pB into z8/z9 and accumulates z0 * z8, z0 * z9 into z16, z17.
 * pA advances by lanes * 8 bytes, pB by 16 bytes.
 */
| .macro KERNELv1x2_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| add pB, pB, 16 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| .endm | |||
/*
 * SAVEv1x2: write back a (lanes x 2) tile of results.
 * For the two C pointers pCRow0 and pCRow0 + LDC the existing values are
 * loaded under p1, accumulator (z16, z17) times alphaZ is added, and the
 * result is stored back. On exit pCRow0 has been advanced by lanes * 8 bytes.
 */
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************/ | |||
/* INITv1x1: zero the single accumulator (z16) used by the v1x1 kernel. */
| .macro INITv1x1 | |||
| dup z16.d, #0 | |||
| .endm | |||
/*
 * KERNELv1x1_SUB: one accumulation step of the (lanes x 1) tile.
 * Loads one vector from pA under p1 into z0, broadcasts one double from pB
 * into z8 and accumulates z0 * z8 into z16.
 * pA advances by lanes * 8 bytes, pB by 8 bytes.
 */
| .macro KERNELv1x1_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| add pB, pB, 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
/*
 * SAVEv1x1: write back a (lanes x 1) tile of results.
 * Loads the existing values at pCRow0 under p1, adds z16 * alphaZ and
 * stores the result back, then advances pCRow0 by lanes * 8 bytes.
 */
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
/*
 * Kernel entry point.
 *
 * Prologue: reserves an 11*16-byte stack frame and stores d8-d17 and
 * x18-x28 in it, broadcasts alpha (taken from d0) into alphaZ, multiplies
 * LDC by 8 and sets p0 to an all-true predicate.
 *
 * N dimension: origN / 8 groups of eight are processed first, followed by
 * at most one group each of 4, 2 and 1, selected by bits 2, 1 and 0 of
 * origN. origPB is advanced past the consumed part of B after each group.
 *
 * M dimension: walked with SVE predication. p1 = whilelt(counterI, origM)
 * masks the active lanes and `lanes` holds their count, so the final
 * partial vector is handled by the same code as the full ones.
 *
 * K dimension: counterL = origK / 8 counts unrolled blocks of eight steps;
 * the remaining origK & 7 steps are done one *_SUB macro at a time.
 *
 * Epilogue: sets x0 to 0, restores the saved registers and returns.
 */
| PROLOGUE | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| dup alphaZ, alpha | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| ptrue p0.d // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x8 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 // at least two blocks of 8 (K >= 16)? | |||
| blt .Ldgemm_kernel_L8_Mv1_32 | |||
/* First block of eight macro steps: _I opens the sequence, M1/M2 alternate. */
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Ldgemm_kernel_L8_Mv1_22a | |||
| .align 5 | |||
/* Middle blocks: executed (K / 8) - 2 times. */
| .Ldgemm_kernel_L8_Mv1_22: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L8_Mv1_22 | |||
| .align 5 | |||
/* Last block of eight macro steps: _E closes the sequence. */
| .Ldgemm_kernel_L8_Mv1_22a: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .align 5 | |||
/* Reached with counterL < 2: run a single _I ... _E block only when bit 0 of counterL is set. */
| .Ldgemm_kernel_L8_Mv1_32: | |||
| tst counterL, #1 | |||
| ble .Ldgemm_kernel_L8_Mv1_40 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .Ldgemm_kernel_L8_Mv1_40: | |||
| INITv1x8 /* INITv1x8 was already issued at the top of this M iteration */ | |||
/* Tail: the remaining K & 7 steps, one KERNELv1x8_SUB each. */
| .Ldgemm_kernel_L8_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L8_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_46: | |||
| KERNELv1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L8_Mv1_46 | |||
| .Ldgemm_kernel_L8_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x8 | |||
/* incd adds the number of 64-bit elements per vector to counterI; b.any tests the flags set by whilelt. */
| .Ldgemm_kernel_L8_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| b.any .Ldgemm_kernel_L8_Mv1_20 | |||
| .Ldgemm_kernel_L8_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 8 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #4 | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L4_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x4 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // at least one block of 8 to do? | |||
| ble .Ldgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L4_Mv1_22 | |||
| .Ldgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L4_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L4_Mv1_46 | |||
| .Ldgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Ldgemm_kernel_L4_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L4_Mv1_20 | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #2 | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x2 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // at least one block of 8 to do? | |||
| ble .Ldgemm_kernel_L2_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L2_Mv1_22 | |||
| .Ldgemm_kernel_L2_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L2_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L2_Mv1_46 | |||
| .Ldgemm_kernel_L2_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x2 | |||
| .Ldgemm_kernel_L2_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L2_Mv1_20 | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC // add 1 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x1 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L1_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_22 | |||
| .Ldgemm_kernel_L1_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L1_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_46 | |||
| .Ldgemm_kernel_L1_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x1 | |||
| .Ldgemm_kernel_L1_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L1_Mv1_20 | |||
| .Ldgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
/* Common exit: return 0 and restore every register saved in the prologue. */
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
/*
 * Pack `a` into `b` in vector-length-agnostic panels using strided gathers.
 *
 * The n dimension is split into chunks of up to svcntd() entries. For the
 * chunk starting at j, and for every i in [0, m), lane l of one gather reads
 * a[i + (j + l) * lda]; the active lanes are then stored contiguously to b,
 * which advances by the number of active lanes. The last chunk may be
 * partial; its predicate comes from svwhilelt_b64(j, n).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    const uint64_t vec_len = svcntd();
    /* Per-lane element offsets 0, lda, 2*lda, ... used by the gather. */
    svint64_t lane_stride = svindex_s64(0LL, lda);

    IFLOAT *chunk_src = a;
    IFLOAT *dst = b;
    BLASLONG done = 0;

    svbool_t pg = svwhilelt_b64(done, n);
    uint64_t n_lanes = svcntp_b64(svptrue_b64(), pg);

    do {
        IFLOAT *src = chunk_src;

        for (uint64_t left = m; left != 0; left--) {
            svfloat64_t v = svld1_gather_index(pg, (double *) src, lane_stride);
            svst1_f64(pg, (double *) dst, v);
            src += 1;
            dst += n_lanes;
        }

        /* Move on to the next chunk of vec_len entries along n. */
        chunk_src += vec_len * lda;
        done += vec_len;
        pg = svwhilelt_b64(done, n);
        n_lanes = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
| @@ -0,0 +1,77 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
/*
 * Pack `a` into `b` in vector-length-agnostic panels using contiguous loads.
 *
 * The n dimension is split into chunks of up to svcntd() entries. For the
 * chunk starting at j, and for every i in [0, m), lane l of one load reads
 * a[(j + l) + i * lda]; the active lanes are then stored contiguously to b,
 * which advances by the number of active lanes. The last chunk may be
 * partial; its predicate comes from svwhilelt_b64(j, n).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    const uint64_t vec_len = svcntd();

    IFLOAT *chunk_src = a;
    IFLOAT *dst = b;
    BLASLONG done = 0;

    svbool_t pg = svwhilelt_b64(done, n);
    uint64_t n_lanes = svcntp_b64(svptrue_b64(), pg);

    do {
        IFLOAT *src = chunk_src;

        for (uint64_t left = m; left != 0; left--) {
            svfloat64_t v = svld1(pg, (double *) src);
            svst1_f64(pg, (double *) dst, v);
            src += lda;
            dst += n_lanes;
        }

        /* Move on to the next chunk of vec_len entries along n. */
        chunk_src += vec_len;
        done += vec_len;
        pg = svwhilelt_b64(done, n);
        n_lanes = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
| @@ -0,0 +1,93 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
/*
 * Vector-length-agnostic packing copy that mirrors accesses across the
 * diagonal of `a`.
 *
 * The n dimension is split into chunks of up to svcntd() lanes. With
 * X = posX + lane (posX grows by svcntd() per chunk) and Y = posY + i for
 * step i in [0, m), each lane gathers
 *     a[X + Y * lda]   while X > Y,
 *     a[Y + X * lda]   otherwise,
 * and the active lanes are stored contiguously to b. The gather indices are
 * updated incrementally: lanes with X > Y step by lda, the others by 1.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    const uint64_t vec_len = svcntd();

    svint64_t ones = svdup_s64(1LL);
    svint64_t stride_v = svdup_s64(lda);
    svint64_t y_v = svdup_s64(posY);
    svint64_t x_v = svdup_s64(posX);
    svint64_t lane_up = svindex_s64(0LL, 1LL);     /* 0, 1, 2, ...   */
    svint64_t lane_down = svindex_s64(0LL, -1LL);  /* 0, -1, -2, ... */

    int64_t done = 0;
    svbool_t pg = svwhilelt_b64(done, n);
    int64_t n_lanes = svcntp_b64(svptrue_b64(), pg);

    do {
        /* (posX - posY - i) > -lane  <=>  X > Y for that lane. */
        svint64_t dist = svdup_s64(posX - posY);
        svbool_t x_gt_y = svcmpgt(pg, dist, lane_down);

        svint64_t x_lane = svadd_z(pg, x_v, lane_up);
        svint64_t idx_x_plus_ylda = svmla_z(pg, x_lane, y_v, stride_v);
        svint64_t idx_y_plus_xlda = svmla_z(pg, y_v, x_lane, lda);
        svint64_t gather_idx = svsel(x_gt_y, idx_x_plus_ylda, idx_y_plus_xlda);

        for (BLASLONG left = m; left > 0; left--) {
            svfloat64_t v = svld1_gather_index(pg, a, gather_idx);

            /* Advance Y by one: +lda where X > Y, +1 elsewhere. */
            gather_idx = svadd_m(x_gt_y, gather_idx, stride_v);
            gather_idx = svadd_m(svnot_z(pg, x_gt_y), gather_idx, ones);

            svst1(pg, b, v);
            b += n_lanes;

            dist = svsub_z(pg, dist, ones);
            x_gt_y = svcmpgt(pg, dist, lane_down);
        }

        posX += vec_len;
        x_v = svdup_s64(posX);
        done += vec_len;
        pg = svwhilelt_b64(done, n);
        n_lanes = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
| @@ -0,0 +1,93 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
/*
 * Vector-length-agnostic packing copy that mirrors accesses across the
 * diagonal of `a`.
 *
 * The n dimension is split into chunks of up to svcntd() lanes. With
 * X = posX + lane (posX grows by svcntd() per chunk) and Y = posY + i for
 * step i in [0, m), each lane gathers
 *     a[Y + X * lda]   while X > Y,
 *     a[X + Y * lda]   otherwise,
 * and the active lanes are stored contiguously to b. The gather indices are
 * updated incrementally: lanes with X > Y step by 1, the others by lda.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    const uint64_t vec_len = svcntd();

    svint64_t ones = svdup_s64(1LL);
    svint64_t stride_v = svdup_s64(lda);
    svint64_t y_v = svdup_s64(posY);
    svint64_t x_v = svdup_s64(posX);
    svint64_t lane_up = svindex_s64(0LL, 1LL);     /* 0, 1, 2, ...   */
    svint64_t lane_down = svindex_s64(0LL, -1LL);  /* 0, -1, -2, ... */

    int64_t done = 0;
    svbool_t pg = svwhilelt_b64(done, n);
    int64_t n_lanes = svcntp_b64(svptrue_b64(), pg);

    do {
        /* (posX - posY - i) > -lane  <=>  X > Y for that lane. */
        svint64_t dist = svdup_s64(posX - posY);
        svbool_t x_gt_y = svcmpgt(pg, dist, lane_down);

        svint64_t x_lane = svadd_z(pg, x_v, lane_up);
        svint64_t idx_x_plus_ylda = svmla_z(pg, x_lane, y_v, stride_v);
        svint64_t idx_y_plus_xlda = svmla_z(pg, y_v, x_lane, lda);
        svint64_t gather_idx = svsel(x_gt_y, idx_y_plus_xlda, idx_x_plus_ylda);

        for (BLASLONG left = m; left > 0; left--) {
            svfloat64_t v = svld1_gather_index(pg, a, gather_idx);

            /* Advance Y by one: +1 where X > Y, +lda elsewhere. */
            gather_idx = svadd_m(x_gt_y, gather_idx, ones);
            gather_idx = svadd_m(svnot_z(pg, x_gt_y), gather_idx, stride_v);

            svst1(pg, b, v);
            b += n_lanes;

            dist = svsub_z(pg, dist, ones);
            x_gt_y = svcmpgt(pg, dist, lane_down);
        }

        posX += vec_len;
        x_v = svdup_s64(posX);
        done += vec_len;
        pg = svwhilelt_b64(done, n);
        n_lanes = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
| @@ -0,0 +1,121 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
/*
 * Vector-length-agnostic triangular packing copy.
 *
 * The n dimension is split into chunks of n_active <= svcntd() lanes; posY
 * advances by n_active per chunk. Within a chunk, X starts at posX and the
 * inner loop runs until i reaches m:
 *   - X > posY : one strided gather (lane l reads ao[l * lda]) is stored to
 *                b; ao, X and i advance by 1, b by n_active.
 *   - X < posY : nothing is written; ao advances by lda, b by n_active.
 *   - X == posY: an n_active x n_active diagonal block is written with
 *                scalar code. Row j holds ao[k * lda + j] for k < j, then
 *                the diagonal entry (ONE when UNIT is defined, otherwise
 *                ao[j * lda + j]), then ZERO for k > j. ao, X and i advance
 *                by n_active, b by n_active * n_active.
 *
 * Empty problems (m <= 0 or n <= 0) return immediately. Without this guard
 * the do/while loops still execute once, and with n == 0 the diagonal
 * branch advances i by zero and never terminates.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
    if (m <= 0 || n <= 0) {
        return 0;
    }

    /* Per-lane element offsets 0, lda, 2*lda, ... used by the gather. */
    svint64_t index = svindex_s64(0LL, lda);

    BLASLONG js = 0;
    svbool_t pn = svwhilelt_b64(js, n);
    int n_active = svcntp_b64(svptrue_b64(), pn);

    do {
        BLASLONG X = posX;
        FLOAT *ao = (posX <= posY) ? a + posY + posX * lda
                                   : a + posX + posY * lda;
        BLASLONG i = 0;

        do {
            if (X > posY) {
                svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
                svst1(pn, b, aj_vec);
                ao++;
                b += n_active;
                X++;
                i++;
            } else if (X < posY) {
                /* This part of b is skipped without being written. */
                ao += lda;
                b += n_active;
                X++;
                i++;
            } else {
                /* The diagonal block is done with scalar code: the original
                   author found no vector-length-agnostic way to unroll it. */
                int out = 0;
                for (int j = 0; j < n_active; j++) {
                    for (int k = 0; k < j; k++) {
                        b[out++] = *(ao + k * lda + j);
                    }
#ifdef UNIT
                    b[out++] = ONE;
#else
                    b[out++] = *(ao + j * lda + j);
#endif
                    for (int k = j + 1; k < n_active; k++) {
                        b[out++] = ZERO;
                    }
                }
                ao += n_active;
                b += n_active * n_active;
                X += n_active;
                i += n_active;
            }
        } while (i < m);

        posY += n_active;
        js += n_active;
        pn = svwhilelt_b64(js, n);
        n_active = svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));

    return 0;
}
| @@ -0,0 +1,121 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* Pack an m x n panel of `a` (leading dimension lda) into the buffer `b`. */ | |||
| /* Columns are handled in chunks of n_active, the number of active lanes of svwhilelt_b64(js, n). */ | |||
| /* For every chunk the m rows are walked with X starting at posX and compared against posY: */ | |||
| /* X > posY : write nothing, only advance ao and b (that part of b is left untouched). */ | |||
| /* X < posY : load n_active contiguous elements from ao and store them contiguously in b. */ | |||
| /* X == posY : emit an n_active x n_active block; entries with k < j are ZERO, */ | |||
| /* the diagonal is ONE when UNIT is defined (otherwise it is copied), entries with k > j come from ao[j*lda + k]. */ | |||
| /* NOTE(review): the 64-bit SVE intrinsics suggest FLOAT is double here -- confirm against the build. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT *ao; | |||
| /* Both loops below are do/while and run their body at least once. */ | |||
| /* With m <= 0 that touches a row outside the panel; with n <= 0 n_active is 0 and the */ | |||
| /* X == posY branch never advances i. An empty panel needs no packing, so return early. */ | |||
| if (m <= 0 || n <= 0) return 0; | |||
| js = 0; | |||
| /* Predicate covering the columns still to be packed; n_active is its active lane count. */ | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posY + posX * lda; | |||
| } else { | |||
| ao = a + posX + posY * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X > posY) { | |||
| /* Nothing is stored for this row; only the cursors move. */ | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X < posY) { | |||
| /* Contiguous read and contiguous write of one packed row. */ | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| for (int k = j; k < n_active; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| } | |||
| #endif | |||
| /* The block consumed n_active rows at once. */ | |||
| ao += n_active * lda; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| /* Move on to the next column chunk and rebuild the predicate for what is left. */ | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,121 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* Pack an m x n panel of `a` (leading dimension lda) into the buffer `b`. */ | |||
| /* Columns are handled in chunks of n_active, the number of active lanes of svwhilelt_b64(js, n). */ | |||
| /* For every chunk the m rows are walked with X starting at posX and compared against posY: */ | |||
| /* X < posY : gather n_active elements spaced lda apart from ao and store them contiguously in b. */ | |||
| /* X > posY : write nothing, only advance ao and b (that part of b is left untouched). */ | |||
| /* X == posY : emit an n_active x n_active block; entries with k < j are ZERO, */ | |||
| /* the diagonal is ONE when UNIT is defined (otherwise it is copied), entries with k > j come from ao[k*lda + j]. */ | |||
| /* NOTE(review): the 64-bit SVE intrinsics suggest FLOAT is double here -- confirm against the build. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| /* Element offsets 0, lda, 2*lda, ... used by the gather load below. */ | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| FLOAT *ao; | |||
| /* Both loops below are do/while and run their body at least once. */ | |||
| /* With m <= 0 that touches a row outside the panel; with n <= 0 n_active is 0 and the */ | |||
| /* X == posY branch never advances i. An empty panel needs no packing, so return early. */ | |||
| if (m <= 0 || n <= 0) return 0; | |||
| js = 0; | |||
| /* Predicate covering the columns still to be packed; n_active is its active lane count. */ | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posX + posY * lda; | |||
| } else { | |||
| ao = a + posY + posX * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X < posY) { | |||
| /* Strided read (stride lda), contiguous write of one packed row. */ | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X > posY) { | |||
| /* Nothing is stored for this row; only the cursors move. */ | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| for (int k = j; k < n_active; k++) { | |||
| b[temp++] = *(ao+k*lda+j); | |||
| } | |||
| } | |||
| #endif | |||
| /* The block consumed n_active rows at once. */ | |||
| ao += n_active; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| /* Move on to the next column chunk and rebuild the predicate for what is left. */ | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* Pack an m x n panel of `a` (leading dimension lda) into the buffer `b`. */ | |||
| /* Columns are handled in chunks of n_active, the number of active lanes of svwhilelt_b64(js, n). */ | |||
| /* For every chunk the m rows are walked with X starting at posX and compared against posY: */ | |||
| /* X < posY : write nothing, only advance ao and b (that part of b is left untouched). */ | |||
| /* X > posY : load n_active contiguous elements from ao and store them contiguously in b. */ | |||
| /* X == posY : emit an n_active x n_active block; entries with k < j come from ao[j*lda + k], */ | |||
| /* the diagonal is ONE when UNIT is defined (otherwise it is copied), entries with k > j are ZERO. */ | |||
| /* NOTE(review): the 64-bit SVE intrinsics suggest FLOAT is double here -- confirm against the build. */ | |||
| /* Always returns 0. */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| FLOAT *ao; | |||
| /* Both loops below are do/while and run their body at least once. */ | |||
| /* With m <= 0 that touches a row outside the panel; with n <= 0 n_active is 0 and the */ | |||
| /* X == posY branch never advances i. An empty panel needs no packing, so return early. */ | |||
| if (m <= 0 || n <= 0) return 0; | |||
| js = 0; | |||
| /* Predicate covering the columns still to be packed; n_active is its active lane count. */ | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| do | |||
| { | |||
| X = posX; | |||
| if (posX <= posY) { | |||
| ao = a + posX + posY * lda; | |||
| } else { | |||
| ao = a + posY + posX * lda; | |||
| } | |||
| i = 0; | |||
| do | |||
| { | |||
| if (X < posY) { | |||
| /* Nothing is stored for this row; only the cursors move. */ | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else | |||
| if (X > posY) { | |||
| /* Contiguous read and contiguous write of one packed row. */ | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k < j; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| b[temp++] = ONE; | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #else | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0 ; k <= j; k++) { | |||
| b[temp++] = *(ao+j*lda+k); | |||
| } | |||
| for (int k = j+1; k < n_active; k++) { | |||
| b[temp++] = ZERO; | |||
| } | |||
| } | |||
| #endif | |||
| /* The block consumed n_active rows at once. */ | |||
| ao += n_active * lda; | |||
| b += n_active*n_active; | |||
| X += n_active; | |||
| i += n_active; | |||
| } | |||
| } while (i < m); | |||
| /* Move on to the next column chunk and rebuild the predicate for what is left. */ | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| return 0; | |||
| } | |||
| @@ -3294,13 +3294,44 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(ARMV8SVE) || defined(A64FX) | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| /* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". | |||
| Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */ | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 160 | |||
| #define CGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_Q 352 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 224 | |||
| #define ZGEMM_DEFAULT_Q 112 | |||
| #define SGEMM_DEFAULT_R 4096 | |||
| #define DGEMM_DEFAULT_R 4096 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #else /* Other/undetected ARMv8 cores */ | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -3325,6 +3356,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #endif /* Cores */ | |||
| #endif /* ARMv8 */ | |||
| #if defined(ARMV5) | |||