Add sgemm part for Arm SVEtags/v0.3.19
| @@ -1483,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT | |||
| $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ | |||
| ifdef STRMMUNCOPY_M | |||
| $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMLNCOPY_M | |||
| $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMUTCOPY_M | |||
| $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef STRMMLTCOPY_M | |||
| $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1809,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). | |||
| $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| ifdef SSYMMUCOPY_M | |||
| $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef SSYMMLCOPY_M | |||
| $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -114,35 +114,26 @@ DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| @@ -114,35 +114,27 @@ DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
| SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
| SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
| SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_sve.c | |||
| SSYMMLCOPY_M = symm_lcopy_sve.c | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* X0 X1 X2 s0 X3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define lanes x15 | |||
| #define pA x16 | |||
| #define alpha w17 | |||
| #define alpha0 s10 | |||
| #define alphaZ z2.s | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 lanes | |||
| // 16 pA | |||
| // 17 | |||
| // 18 must save | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
| //v00 ALPHA -> pA0_0 | |||
| //v01 pA0_1 | |||
| //v02 ALPHA0 | |||
| //v03 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| //v07 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB0_4 | |||
| //v13 must save pB0_5 | |||
| //v14 must save pB0_6 | |||
| //v15 must save pB0_7 | |||
| //v16 must save C0 | |||
| //v17 must save C1 | |||
| //v18 must save C2 | |||
| //v19 must save C3 | |||
| //v20 must save C4 | |||
| //v21 must save C5 | |||
| //v22 must save C6 | |||
| //v23 must save C7 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| .macro INITv1x8 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| dup z18.s, #0 | |||
| dup z19.s, #0 | |||
| dup z20.s, #0 | |||
| dup z21.s, #0 | |||
| dup z22.s, #0 | |||
| dup z23.s, #0 | |||
| .endm | |||
| .macro KERNELv1x8_I | |||
| ld1w z0.s, p1/z, [pA] | |||
| ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_M1 | |||
| ld1w z1.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_M2 | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| fmla z16.s, p1/m, z1.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| fmla z17.s, p1/m, z1.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| fmla z18.s, p1/m, z1.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| fmla z19.s, p1/m, z1.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| fmla z20.s, p1/m, z1.s, z12.s | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.s, p1/m, z1.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| fmla z22.s, p1/m, z1.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| fmla z23.s, p1/m, z1.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| .endm | |||
| .macro KERNELv1x8_E | |||
| fmla z16.s, p1/m, z1.s, z8.s | |||
| fmla z17.s, p1/m, z1.s, z9.s | |||
| fmla z18.s, p1/m, z1.s, z10.s | |||
| fmla z19.s, p1/m, z1.s, z11.s | |||
| fmla z20.s, p1/m, z1.s, z12.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.s, p1/m, z1.s, z13.s | |||
| fmla z22.s, p1/m, z1.s, z14.s | |||
| fmla z23.s, p1/m, z1.s, z15.s | |||
| .endm | |||
| .macro KERNELv1x8_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| fmla z20.s, p1/m, z0.s, z12.s | |||
| fmla z21.s, p1/m, z0.s, z13.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z22.s, p1/m, z0.s, z14.s | |||
| fmla z23.s, p1/m, z0.s, z15.s | |||
| .endm | |||
| .macro SAVEv1x8 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z26.s, p1/z, [pCRow2] | |||
| fmla z26.s, p1/m, z18.s, alphaZ | |||
| st1w z26.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z27.s, p1/z, [pCRow1] | |||
| fmla z27.s, p1/m, z19.s, alphaZ | |||
| st1w z27.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z28.s, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaZ | |||
| st1w z28.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z29.s, p1/z, [pCRow1] | |||
| fmla z29.s, p1/m, z21.s, alphaZ | |||
| st1w z29.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z30.s, p1/z, [pCRow2] | |||
| fmla z30.s, p1/m, z22.s, alphaZ | |||
| st1w z30.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z31.s, p1/z, [pCRow1] | |||
| fmla z31.s, p1/m, z23.s, alphaZ | |||
| st1w z31.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x4 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| dup z18.s, #0 | |||
| dup z19.s, #0 | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| add pB, pB, 16 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z18.s, p1/m, z0.s, z10.s | |||
| fmla z19.s, p1/m, z0.s, z11.s | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1w z26.s, p1/z, [pCRow2] | |||
| fmla z26.s, p1/m, z18.s, alphaZ | |||
| st1w z26.s, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z27.s, p1/z, [pCRow1] | |||
| fmla z27.s, p1/m, z19.s, alphaZ | |||
| st1w z27.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x2 | |||
| dup z16.s, #0 | |||
| dup z17.s, #0 | |||
| .endm | |||
| .macro KERNELv1x2_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| add pB, pB, 8 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z17.s, p1/m, z0.s, z9.s | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1w z25.s, p1/z, [pCRow1] | |||
| fmla z25.s, p1/m, z17.s, alphaZ | |||
| st1w z25.s, p1, [pCRow1] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************/ | |||
| .macro INITv1x1 | |||
| dup z16.s, #0 | |||
| .endm | |||
| .macro KERNELv1x1_SUB | |||
| ld1w z0.s, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #2 // pA = pA + lanes * 8 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| add pB, pB, 4 | |||
| fmla z16.s, p1/m, z0.s, z8.s | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1w z24.s, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaZ | |||
| st1w z24.s, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, s0 | |||
| dup alphaZ, alpha | |||
| lsl LDC, LDC, #2 // ldc = ldc * 4 | |||
| ptrue p0.s // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM | |||
| cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x8 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt .Ldgemm_kernel_L8_Mv1_32 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Ldgemm_kernel_L8_Mv1_22a | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L8_Mv1_22 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_22a: | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_32: | |||
| tst counterL, #1 | |||
| ble .Ldgemm_kernel_L8_Mv1_40 | |||
| KERNELv1x8_I | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_M2 | |||
| KERNELv1x8_M1 | |||
| KERNELv1x8_E | |||
| b .Ldgemm_kernel_L8_Mv1_44 | |||
| .Ldgemm_kernel_L8_Mv1_40: | |||
| INITv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L8_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_46: | |||
| KERNELv1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L8_Mv1_46 | |||
| .Ldgemm_kernel_L8_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x8 | |||
| .Ldgemm_kernel_L8_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension | |||
| b.any .Ldgemm_kernel_L8_Mv1_20 | |||
| .Ldgemm_kernel_L8_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 8 * 4 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #4 | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L4_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x4 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 4 to do? | |||
| ble .Ldgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L4_Mv1_22 | |||
| .Ldgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L4_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L4_Mv1_46 | |||
| .Ldgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Ldgemm_kernel_L4_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L4_Mv1_20 | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #4 | |||
| add origPB, origPB, temp // B = B + K * 4 * 4 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #2 | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x2 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 4 to do? | |||
| ble .Ldgemm_kernel_L2_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L2_Mv1_22 | |||
| .Ldgemm_kernel_L2_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L2_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L2_Mv1_46 | |||
| .Ldgemm_kernel_L2_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x2 | |||
| .Ldgemm_kernel_L2_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L2_Mv1_20 | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC | |||
| add pC, pC, LDC // add 1 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Ldgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x1 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L1_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_22 | |||
| .Ldgemm_kernel_L1_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Ldgemm_kernel_L1_Mv1_100 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Ldgemm_kernel_L1_Mv1_46 | |||
| .Ldgemm_kernel_L1_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x1 | |||
| .Ldgemm_kernel_L1_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Ldgemm_kernel_L1_Mv1_20 | |||
| .Ldgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,78 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| svint32_t lda_vec = svindex_s32(0LL, lda); | |||
| uint32_t sve_size = svcntw(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | |||
| svst1_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1++; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size * lda; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,77 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| uint32_t sve_size = svcntw(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec = svld1(pg, (float *) aoffset1); | |||
| svst1_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, offset; | |||
| #if defined(DOUBLE) | |||
| uint64_t sve_size = svcntd(); | |||
| svint64_t posY_vec = svdup_s64(posY); | |||
| svint64_t posX_vec = svdup_s64(posX); | |||
| @@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| uint32_t sve_size = svcntw(); | |||
| svint32_t posY_vec = svdup_s32(posY); | |||
| svint32_t posX_vec = svdup_s32(posX); | |||
| svint32_t lda_vec = svdup_s32(lda); | |||
| svint32_t one_vec = svdup_s32(1); | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| do { | |||
| offset = posX - posY; | |||
| svint32_t vec_off = svdup_s32(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint32_t temp = svadd_z(pg, posX_vec, index); | |||
| svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, offset; | |||
| #if defined(DOUBLE) | |||
| uint64_t sve_size = svcntd(); | |||
| svint64_t posY_vec = svdup_s64(posY); | |||
| svint64_t posX_vec = svdup_s64(posX); | |||
| @@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| uint32_t sve_size = svcntw(); | |||
| svint32_t posY_vec = svdup_s32(posY); | |||
| svint32_t posX_vec = svdup_s32(posX); | |||
| svint32_t lda_vec = svdup_s32(lda); | |||
| svint32_t one_vec = svdup_s32(1); | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| do { | |||
| offset = posX - posY; | |||
| svint32_t vec_off = svdup_s32(offset); | |||
| svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
| svint32_t temp = svadd_z(pg, posX_vec, index); | |||
| svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
| svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
| svint32_t gat_ind = svsel(cmp, temp2, temp1); | |||
| i = m; | |||
| while (i>0) { | |||
| svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
| gat_ind = svadd_m(cmp, gat_ind, one_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| svst1(pg, b, data_vec); | |||
| b += active; | |||
| offset --; | |||
| vec_off = svsub_z(pg, vec_off, one_vec); | |||
| cmp = svcmpgt(pg, vec_off, index_neg); | |||
| i--; | |||
| } | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| FLOAT *ao; | |||
| js = 0; | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| do | |||
| { | |||
| if (X > posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| b += n_active; | |||
| @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i ++; | |||
| } else | |||
| if (X < posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| @@ -112,9 +121,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| FLOAT *ao; | |||
| js = 0; | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| do | |||
| { | |||
| if (X < posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| b += n_active; | |||
| @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| { | |||
| X = posX; | |||
| @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| i ++; | |||
| } else | |||
| if (X > posY) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| ao += lda; | |||
| b += n_active; | |||
| @@ -111,9 +120,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -3309,14 +3309,22 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #elif defined(ARMV8SVE) || defined(A64FX) | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". | |||
| Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| /* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) | |||
| * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. | |||
| * If SVE size is ever more than 1024, this should be increased also. */ | |||
| #define SGEMM_DEFAULT_UNROLL_MN 32 | |||
| /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". | |||
| Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_MN 32 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||