Use SVE zgemm/cgemm on Arm(R) Neoverse(TM) V1 coretags/v0.3.24
| @@ -160,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -184,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -1,66 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRMMUNCOPY_M = | |||
| CTRMMLNCOPY_M = | |||
| CTRMMUTCOPY_M = | |||
| CTRMMLTCOPY_M = | |||
| CHEMMLTCOPY_M = | |||
| CHEMMUTCOPY_M = | |||
| CSYMMUCOPY_M = | |||
| CSYMMLCOPY_M = | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMCOPYLN_M = | |||
| ZTRSMCOPYLT_M = | |||
| ZTRSMCOPYUN_M = | |||
| ZTRSMCOPYUT_M = | |||
| ZTRMMUNCOPY_M = | |||
| ZTRMMLNCOPY_M = | |||
| ZTRMMUTCOPY_M = | |||
| ZTRMMLTCOPY_M = | |||
| ZHEMMLTCOPY_M = | |||
| ZHEMMUTCOPY_M = | |||
| ZSYMMUCOPY_M = | |||
| ZSYMMLCOPY_M = | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M1 | |||
| @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M2 | |||
| @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_E | |||
| @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z3.s, z15.s | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaz_R | |||
| fmls z28.s, p1/m, z21.s, alphaz_I | |||
| @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z31.s, p1/m, z23.s, alphaz_R | |||
| st2w {z30.s, z31.s}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, s0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, s1 | |||
| @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bne .Lcgemm_kernel_L4_Mv1_46 | |||
| .Lcgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Lcgemm_kernel_L4_Mv1_END: | |||
| @@ -0,0 +1,121 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_INDEX svuint64_t | |||
| #define SV_INDEXER svindex_u64 | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_INDEX svuint32_t | |||
| #define SV_INDEXER svindex_u32 | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
| a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \ | |||
| a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \ | |||
| svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \ | |||
| a_offset_inner += 2; \ | |||
| b_offset += active * 2; | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| uint64_t sve_size; | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); | |||
| SV_TYPE a_vec_real; | |||
| SV_TYPE a_vec_imag; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size * lda * 2; | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define COUNT "cntd" | |||
| #define SV_TYPE svfloat64x2_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32x2_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ | |||
| a_vec = svld2(pg, a_offset_inner); \ | |||
| svst2(pg, b_offset, a_vec); \ | |||
| a_offset_inner += lda * 2; \ | |||
| b_offset += active * 2; | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| uint64_t sve_size = svcntw(); | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_TYPE a_vec; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size * 2; | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M1 | |||
| @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M2 | |||
| @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_E | |||
| @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ir z23.d, p1/m, z3.d, z14.d | |||
| OP_ii z22.d, p1/m, z3.d, z15.d | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ir z23.d, p1/m, z1.d, z14.d | |||
| OP_ii z22.d, p1/m, z1.d, z15.d | |||
| OP_ri z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2d {z28.d, z29.d}, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaz_R | |||
| @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z31.d, p1/m, z23.d, alphaz_R | |||
| st2d {z30.d, z31.d}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, d1 | |||
| @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bne .Lzgemm_kernel_L4_Mv1_46 | |||
| .Lzgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Lzgemm_kernel_L4_Mv1_END: | |||
| @@ -3385,11 +3385,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_MN 16 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_MN 16 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 160 | |||