| @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // Symbolic names for the general-purpose and SVE registers used by the kernel macros, | |||
| // followed by the prefetch distances (in bytes) applied to the pA, pB and pCRow pointers. | |||
| // NOTE(review): pCRow3 and lanes both map to x15, and alphaZ and B_PRE_SIZE are each | |||
| // defined twice below. This looks like the old and new sides of a diff shown together; | |||
| // confirm which definition of each is meant to survive. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pCRow3 x15 | |||
| #define lanes x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define alpha0 d10 | |||
| #define alphaZ z10.d | |||
| #define alphaV0 v10.d[0] | |||
| #define alphaZ z2.d | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| @@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pCRow3 | |||
| // 15 lanes | |||
| // 16 pA | |||
| // 17 | |||
| // 17 | |||
| // 18 must save | |||
| // 19 must save | |||
| // 20 must save | |||
| @@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v00 ALPHA -> pA0_0 | |||
| //v01 pA0_1 | |||
| //v02 pA0_2 | |||
| //v03 pA0_3 | |||
| //v04 pA0_4 | |||
| //v05 pA0_5 | |||
| //v06 pA0_6 | |||
| //v07 pA0_7 | |||
| //v02 ALPHA0 | |||
| //v03 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| //v07 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v10 must save pB0_2 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v12 must save pB0_4 | |||
| //v13 must save pB0_5 | |||
| //v14 must save pB0_6 | |||
| //v15 must save pB0_7 | |||
| //v16 must save C0 | |||
| //v17 must save C1 | |||
| //v18 must save C2 | |||
| @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA] | |||
| ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one | |||
| ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
| //incb pA, all, mul #2 | |||
| add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| @@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| @@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_M1 | |||
| ld1d z1.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| @@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| @@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_M2 | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| fmla z16.d, p1/m, z1.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| @@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| @@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z18.d, p1/m, z1.d, z10.d | |||
| fmla z19.d, p1/m, z1.d, z11.d | |||
| fmla z20.d, p1/m, z1.d, z12.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z21.d, p1/m, z1.d, z13.d | |||
| fmla z22.d, p1/m, z1.d, z14.d | |||
| fmla z23.d, p1/m, z1.d, z15.d | |||
| @@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| @@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| fmla z20.d, p1/m, z0.d, z12.d | |||
| fmla z21.d, p1/m, z0.d, z13.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla z22.d, p1/m, z0.d, z14.d | |||
| fmla z23.d, p1/m, z0.d, z15.d | |||
| .endm | |||
| .macro SAVEv1x8 | |||
| dup alphaZ, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| @@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z28.d, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaZ | |||
| st1d z28.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z29.d, p1/z, [pCRow1] | |||
| fmla z29.d, p1/m, z21.d, alphaZ | |||
| st1d z29.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z30.d, p1/z, [pCRow2] | |||
| fmla z30.d, p1/m, z22.d, alphaZ | |||
| st1d z30.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z31.d, p1/z, [pCRow1] | |||
| fmla z31.d, p1/m, z23.d, alphaZ | |||
| st1d z31.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| @@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x4_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| @@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z18.d, p1/m, z0.d, z10.d | |||
| fmla z19.d, p1/m, z0.d, z11.d | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| dup alphaZ, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| @@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1d z26.d, p1/z, [pCRow2] | |||
| fmla z26.d, p1/m, z18.d, alphaZ | |||
| st1d z26.d, p1, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z27.d, p1/z, [pCRow1] | |||
| fmla z27.d, p1/m, z19.d, alphaZ | |||
| st1d z27.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| @@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x2_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| @@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pB, pB, 16 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla z17.d, p1/m, z0.d, z9.d | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| dup alphaZ, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| @@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1d z25.d, p1/z, [pCRow1] | |||
| fmla z25.d, p1/m, z17.d, alphaZ | |||
| st1d z25.d, p1, [pCRow1] | |||
| add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
| .endm | |||
| @@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // KERNELv1x1_SUB: one k-step of the (one SVE vector) x (one column) update. | |||
| // Loads a vector of doubles from pA under predicate p1, loads one double from pB and | |||
| // replicates it to every lane, then accumulates z16 += z0 * z8 on the lanes active in p1. | |||
| // Advances pA past the loaded elements and pB by 8 bytes; prefetches pA + A_PRE_SIZE into L1. | |||
| // NOTE(review): the two consecutive `add pA` lines (x18 and lanes) look like the old and | |||
| // new sides of a diff shown together; confirm that only one advance of pA is intended. | |||
| // x18 is a platform-reserved register on some AArch64 ABIs. | |||
| .macro KERNELv1x1_SUB | |||
| ld1d z0.d, p1/z, [pA] | |||
| add pA, pA, x18, lsl #3 // pA = pA + x18 * 8 bytes (x18 = active lane count from cntp) | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 bytes (lanes = active lane count from cntp) | |||
| ld1rd z8.d, p0/z, [pB] | |||
| add pB, pB, 8 | |||
| fmla z16.d, p1/m, z0.d, z8.d | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| // SAVEv1x1: write back one accumulator column: C[pCRow0] += alpha * z16 on the lanes | |||
| // active in p1, then advance pCRow0 past the stored elements. | |||
| // Also broadcasts the scalar alpha into alphaZ, prefetches pCRow0 + C_PRE_SIZE into L2, | |||
| // and sets pCRow1 = pCRow0 + LDC. | |||
| // NOTE(review): the two consecutive `add pCRow0` lines (x18 and lanes) look like the old | |||
| // and new sides of a diff shown together; confirm that only one advance is intended. | |||
| // NOTE(review): the kernel prologue also performs `dup alphaZ, alpha`; confirm whether | |||
| // the broadcast here is still required. | |||
| .macro SAVEv1x1 | |||
| dup alphaZ, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1d z24.d, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaZ | |||
| st1d z24.d, p1, [pCRow0] | |||
| add pCRow0, pCRow0, x18, lsl #3 // pCRow0 = pCRow0 + x18 * 8 bytes (x18 = active lane count) | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pCRow0 = pCRow0 + lanes * 8 bytes (lanes = active lane count) | |||
| .endm | |||
| @@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| dup alphaZ, alpha | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| ptrue p0.d // create true predicate | |||
| @@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC | |||
| add pC, pCRow0, LDC, lsl #3 // add 8 x LDC | |||
| add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM | |||
| cntp x18, p0, p1.d | |||
| /* mov counterI, origM */ | |||
| /* asr counterI, counterI, #3 // counterI = counterI / 8 */ | |||
| /* cmp counterI, #0 */ | |||
| /* ble .Ldgemm_kernel_L4_M4_BEGIN */ | |||
| cntp lanes, p0, p1.d // lanes = number of active SVE lanes in the M dimension | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_Mv1_20: | |||
| @@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d // lanes = number of active SVE lanes in the M dimension | |||
| b.any .Ldgemm_kernel_L8_Mv1_20 | |||
| .Ldgemm_kernel_L8_END: | |||
| @@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov pCRow0, pC | |||
| add pC, pCRow0, LDC, lsl #2 // add 4 x LDC | |||
| add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_20: | |||
| @@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| blt .Ldgemm_kernel_L4_Mv1_44 | |||
| ble .Ldgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| KERNELv1x4_SUB | |||
| @@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L4_Mv1_20 | |||
| .Ldgemm_kernel_L4_END: | |||
| add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| @@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov pCRow0, pC | |||
| add pC, pCRow0, LDC, lsl #1 // add 2 x LDC | |||
| add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_20: | |||
| @@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| blt .Ldgemm_kernel_L2_Mv1_44 | |||
| ble .Ldgemm_kernel_L2_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| @@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| @@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L2_Mv1_20 | |||
| @@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov pCRow0, pC | |||
| add pC, pCRow0, LDC, lsl #1 // add 2 x LDC | |||
| add pC, pC, LDC // add 1 x LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_20: | |||
| @@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| INITv1x1 // fill with zeros | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #0 // is there at least 4 to do? | |||
| blt .Ldgemm_kernel_L1_Mv1_44 | |||
| cmp counterL , #0 // is there at least 8 to do? | |||
| ble .Ldgemm_kernel_L1_Mv1_44 | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_22: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| @@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_Mv1_46: | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Ldgemm_kernel_L1_Mv1_46 | |||
| bgt .Ldgemm_kernel_L1_Mv1_46 | |||
| .Ldgemm_kernel_L1_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| @@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp x18, p0, p1.d | |||
| cntp lanes, p0, p1.d | |||
| b.any .Ldgemm_kernel_L1_Mv1_20 | |||