| @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
// Preprocessor aliases for the alpha scale factor.
// alphaN names the scalar D-register view; alphaVN names lane 0 of the same
// vector register, the form the SAVE* macros pass to fmla as the multiplier.
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// The hunk header above shrinks 12 lines to 10, so some of the rows below are
// pre-patch only. Presumably alpha1..alpha3 / alphaV1..alphaV3 are the removed
// ones, since SAVE* rows that use only alphaV0 also appear; confirm against
// the real file.
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
// Prefetch distances, used as the immediate offset of prfm instructions:
// A_PRE_SIZE relative to pA, B_PRE_SIZE relative to pB, and C_PRE_SIZE
// relative to the pCRow* pointers.
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 | |||
| // 18 must save | |||
| // 19 must save | |||
| @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v05 pA1_2, pA1_3 | |||
| //v06 pA1_4, pA1_5 | |||
| //v07 pA1_6, pA1_7 | |||
| //v08 must save pB0_0, pB0_1 | |||
| //v09 must save pB0_2, pB0_3 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v12 must save pB1_0, pB1_1 | |||
| //v13 must save pB1_2, pB1_3 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 C04, C05 | |||
| @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
// KERNEL8x4_I: first step of the 8x4 kernel sequence.
// Every arithmetic row is fmul (not fmla), so v16-v31 are overwritten with
// products instead of being added to; no prior zeroing of v16-v31 is needed.
// Operands of the multiplies: v0-v3 (loaded from [pA]) times lane 0 of
// v8-v11 (loaded as d8-d11 from [pB]).
// The macro also loads v4-v7 from [pA] and d12-d15 from [pB]; none of the
// fmul rows here read them (KERNEL8x4_M2 / KERNEL8x4_E do).
// Side effects: advances pA and pB, issues prfm hints relative to pA.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// Both the "ld1/ldp + add" form and the post-indexed "ldp ..., #imm" form of
// each load are listed, and several fmul rows (e.g. v17, v18, v20) appear
// twice. The macro is not assemblable as shown; separate the two sides
// before changing any instruction.
| .macro KERNEL8x4_I | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| fmul v22.2d, v2.2d, v9.2d[0] | |||
| fmul v23.2d, v3.2d, v9.2d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v22.2d, v2.2d, v9.2d[0] | |||
| ldp d14, d15, [pB], #16 | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| fmul v30.2d, v2.2d, v11.2d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| fmul v27.2d, v3.2d, v10.2d[0] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v11.2d[0] | |||
| fmul v31.2d, v3.2d, v11.2d[0] | |||
| fmul v23.2d, v3.2d, v9.2d[0] | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
// KERNEL8x4_M1: accumulate step that consumes the v0-v3 / v8-v11 operand set.
// Each fmla adds (v0..v3) * (lane 0 of v8..v11) into one of v16-v31.
// Interleaved with the arithmetic, it loads v4-v7 from [pA] and d12-d15 from
// [pB], i.e. the operand set that KERNEL8x4_M2 and KERNEL8x4_E read.
// Side effects: advances pA and pB, issues prfm hints relative to pA.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// Single-register "ld1 {vN.2d}, [pA], #16" loads and paired "ldp qN, qM"
// loads are both listed, prefetches appear with both #224 and A_PRE_SIZE
// offsets, and several fmla rows are duplicated. Executing every row as
// shown would accumulate some products twice; separate the two sides before
// changing any instruction.
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| ld1 {v4.2d}, [pA], #16 | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| ld1 {v5.2d}, [pA], #16 | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| ldp q4, q5, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| ldp d12, d13, [pB], #16 | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| ld1 {v6.2d}, [pA], #16 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| ld1 {v7.2d}, [pA], #16 | |||
| ldp d14, d15, [pB], #16 | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| prfm PLDL1KEEP, [pA, #224] | |||
| prfm PLDL1KEEP, [pA, #224+64] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| .endm | |||
// KERNEL8x4_M2: accumulate step that consumes the v4-v7 / v12-v15 operand set.
// Each fmla adds (v4..v7) * (lane 0 of v12..v15) into one of v16-v31.
// Interleaved with the arithmetic, it reloads v0-v3 from [pA] and d8-d11 from
// [pB], i.e. the operand set that KERNEL8x4_M1 reads.
// Side effects: advances pA and pB, issues prfm hints relative to pB.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// "ld1 {vN.2d}" and "ldp qN, qM" loads, "ldp + add" and post-indexed ldp, and
// prefetches at both #640 and B_PRE_SIZE are listed side by side, and several
// fmla rows are duplicated. Separate the two sides before changing any
// instruction.
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| ld1 {v0.2d}, [pA], #16 | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| ld1 {v1.2d}, [pA], #16 | |||
| ldp q0, q1, [pA], #32 | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| ld1 {v2.2d}, [pA], #16 | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| ld1 {v3.2d}, [pA], #16 | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| prfm PLDL1KEEP, [pB, #640] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| .endm | |||
// KERNEL8x4_E: closing accumulate step.
// Adds (v4..v7) * (lane 0 of v12..v15) into v16-v31 like KERNEL8x4_M2, but
// performs no loads from pA or pB and leaves both pointers unchanged; the
// only memory operation is one prefetch hint relative to pB.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// The fmla rows for v17-v19, v21-v23 and v25-v27 are listed twice (old
// ordering and new ordering); separate the two sides before changing any
// instruction.
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| .endm | |||
// KERNEL8x4_SUB: self-contained single accumulate step.
// Loads v0-v3 from [pA] and d8-d11 from [pB], then adds
// (v0..v3) * (lane 0 of v8..v11) into v16-v31 with fmla. Unlike the
// KERNEL8x4_M1 / KERNEL8x4_M2 pair, it does not load operands for a later
// step. It is invoked from the dgemm_kernel_L4_M8_46 loop, whose counter is
// set from origK & 7.
// Side effects: advances pA and pB, issues prfm hints relative to pA and pB.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// "ld1/ldp + add" loads and post-indexed ldp loads are both listed and most
// fmla rows appear twice; separate the two sides before changing any
// instruction.
| .macro KERNEL8x4_SUB | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA], #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| .endm | |||
// SAVE8x4: write the accumulators back, scaled by alpha: C += alpha * acc.
// "fmov alpha0, alpha" first copies alpha into d10 (alpha0), which overwrites
// whatever B value the kernel macros left in v10.
// For each of pCRow0..pCRow3, in two 32-byte halves: load the current
// contents, fmla with the matching accumulators times alphaV0, store back,
// and advance the pointer by 32. Each pCRowN therefore ends 64 bytes further.
//   pCRow0 <- v16-v19    pCRow1 <- v20-v23
//   pCRow2 <- v24-v27    pCRow3 <- v28-v31
// Clobbers v0-v7 (temporaries for the loaded values) and d10.
//
// NOTE(review): this text is a rendered diff whose +/- markers were stripped.
// Each load/store is listed in both ld1/st1 and ldp/stp form, and prefetches
// appear both per half (C_PRE_SIZE) and as a trailing group (#128). Executing
// every row as shown would store each half twice; separate the two sides
// before changing any instruction.
| .macro SAVE8x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v0.2d, v1.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ldp q0, q1, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d}, [pCRow0] | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1 {v2.2d, v3.2d}, [pCRow0] | |||
| ldp q2, q3, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v2.2d, v3.2d}, [pCRow0] | |||
| stp q2, q3, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| ld1 {v4.2d, v5.2d}, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ldp q4, q5, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d}, [pCRow1] | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1 {v6.2d, v7.2d}, [pCRow1] | |||
| ldp q6, q7, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| st1 {v6.2d, v7.2d}, [pCRow1] | |||
| stp q6, q7, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| ld1 {v0.2d, v1.2d}, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ldp q0, q1, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0 | |||
| fmla v1.2d, v25.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d}, [pCRow2] | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld1 {v2.2d, v3.2d}, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ldp q2, q3, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0 | |||
| fmla v3.2d, v27.2d, alphaV0 | |||
| st1 {v2.2d, v3.2d}, [pCRow2] | |||
| stp q2, q3, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld1 {v4.2d, v5.2d}, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ldp q4, q5, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0 | |||
| fmla v5.2d, v29.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d}, [pCRow3] | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ld1 {v6.2d, v7.2d}, [pCRow3] | |||
| ldp q6, q7, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0 | |||
| fmla v7.2d, v31.2d, alphaV0 | |||
| st1 {v6.2d, v7.2d}, [pCRow3] | |||
| stp q6, q7, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #128] | |||
| prfm PLDL2KEEP, [pCRow1, #128] | |||
| prfm PLDL2KEEP, [pCRow2, #128] | |||
| prfm PLDL2KEEP, [pCRow3, #128] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -422,30 +433,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d, v9.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| fmla v9.2d, v25.2d, alphaV1 | |||
| fmla v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV2 | |||
| fmla v13.2d, v29.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| fmla v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -474,6 +486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV2 | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v12.d}[0], [pCRow2] | |||
| ld1 {v12.d}[1], [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -571,20 +585,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV1 | |||
| fmla v2.2d, v18.2d, alphaV2 | |||
| fmla v3.2d, v19.2d, alphaV3 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV1 | |||
| fmla v6.2d, v22.2d, alphaV2 | |||
| fmla v7.2d, v23.2d, alphaV3 | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -612,16 +627,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -646,6 +662,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -676,6 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -713,11 +731,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV1 | |||
| fmla v2.2d, v18.2d, alphaV2 | |||
| fmla v3.2d, v19.2d, alphaV3 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -743,9 +762,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -769,6 +789,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| ldr d8, [pCRow0] | |||
| fmadd d8, d16, alpha0, d8 | |||
| str d8, [pCRow0] | |||
| @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: | |||
| b dgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #7 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: | |||
| bne dgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||