Browse Source

THUNDERX2T99: Improve DGEMM

tags/v0.2.20^2
Ashwin Sekhar T K 9 years ago
parent
commit
0f1d6e8b39
1 changed files with 58 additions and 87 deletions
  1. +58
    -87
      kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S

+ 58
- 87
kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S View File

@@ -151,187 +151,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

// KERNEL8x4_I
// Initial step: writes all sixteen accumulators v16-v31 with fmul (no prior
// accumulator contents are read).
//   reads : 128 bytes at [pA] -> q0-q7, 64 bytes at [pB] -> q8, q9, q12, q13
//   writes: v16-v31 = products of v0-v3 with the four lanes of v8/v9
//   leaves: q4-q7 and q12/q13 loaded but not multiplied here; within this
//           file the KERNEL8x4_M2-style code consumes exactly those registers
//   advances pA by 128 and pB by 64 bytes
.macro KERNEL8x4_I
	// Every load below uses an offset from the *un-advanced* pA/pB, so all
	// of them must stay ahead of the two add instructions further down.
	ldp	q0, q1, [pA]
	ldp	q8, q9, [pB]
	ldp	q2, q3, [pA, #32]
	ldp	q4, q5, [pA, #64]
	ldp	q12, q13, [pB, #32]
	ldp	q6, q7, [pA, #96]

	fmul	v16.2d, v0.2d, v8.d[0]
	fmul	v20.2d, v0.2d, v8.d[1]
	fmul	v17.2d, v1.2d, v8.d[0]
	fmul	v21.2d, v1.2d, v8.d[1]

	// Pointers are bumped exactly once per invocation.
	add	pA, pA, #128
	add	pB, pB, #64

	fmul	v24.2d, v0.2d, v9.d[0]
	fmul	v28.2d, v0.2d, v9.d[1]
	fmul	v25.2d, v1.2d, v9.d[0]
	fmul	v29.2d, v1.2d, v9.d[1]

	// Prefetch hints relative to the already advanced pA.
	prfm	PLDL1KEEP, [pA, A_PRE_SIZE]
	prfm	PLDL1KEEP, [pA, A_PRE_SIZE_64]

	fmul	v18.2d, v2.2d, v8.d[0]
	fmul	v22.2d, v2.2d, v8.d[1]
	fmul	v26.2d, v2.2d, v9.d[0]
	fmul	v30.2d, v2.2d, v9.d[1]

	fmul	v19.2d, v3.2d, v8.d[0]
	fmul	v27.2d, v3.2d, v9.d[0]
	fmul	v31.2d, v3.2d, v9.d[1]
	fmul	v23.2d, v3.2d, v8.d[1]
.endm

// KERNEL8x4_M1_M2
// Two back-to-back accumulation steps into v16-v31.
//   on entry: v0-v3 and v8/v9 must already hold operands (they are read
//             before this macro reloads them)
//   step 1  : loads q4-q7 / q12-q13 from [pA], [pA,#32], [pB];
//             v16-v31 += v0-v3 * lanes of v8/v9
//   step 2  : loads q0-q3 / q8-q9 from [pA,#64], [pA,#96], [pB,#32];
//             v16-v31 += v4-v7 * lanes of v12/v13
//   advances pA by 128 and pB by 64 bytes
// Each of the sixteen accumulators receives exactly one fmla per step.
.macro KERNEL8x4_M1_M2
	ldp	q12, q13, [pB]
	ldp	q4, q5, [pA]
	ldp	q6, q7, [pA, #32]

	// ---- step 1: operands v0-v3, v8, v9 ----
	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v28.2d, v0.2d, v9.d[1]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE]

	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v25.2d, v1.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]
	fmla	v29.2d, v1.2d, v9.d[1]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE_64]

	fmla	v18.2d, v2.2d, v8.d[0]
	fmla	v22.2d, v2.2d, v8.d[1]
	fmla	v26.2d, v2.2d, v9.d[0]
	fmla	v30.2d, v2.2d, v9.d[1]
	fmla	v19.2d, v3.2d, v8.d[0]
	fmla	v23.2d, v3.2d, v8.d[1]

	// NOTE(review): literal prefetch distance, unlike the named
	// A_PRE_SIZE* constants used above - confirm it is intentional.
	prfm	PLDL1KEEP, [pA, #3840]

	fmla	v27.2d, v3.2d, v9.d[0]
	fmla	v31.2d, v3.2d, v9.d[1]

	// v0-v3 / v8-v9 are no longer needed by step 1, so they can be
	// overwritten with the operands the following code will consume.
	ldp	q8, q9, [pB, #32]
	ldp	q0, q1, [pA, #64]
	ldp	q2, q3, [pA, #96]

	// ---- step 2: operands v4-v7, v12, v13 ----
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v28.2d, v4.2d, v13.d[1]

	prfm	PLDL1KEEP, [pB, B_PRE_SIZE]

	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v25.2d, v5.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]
	fmla	v29.2d, v5.2d, v13.d[1]

	fmla	v18.2d, v6.2d, v12.d[0]
	fmla	v22.2d, v6.2d, v12.d[1]
	fmla	v26.2d, v6.2d, v13.d[0]
	fmla	v30.2d, v6.2d, v13.d[1]

	fmla	v19.2d, v7.2d, v12.d[0]
	fmla	v23.2d, v7.2d, v12.d[1]

	// All loads above used offsets from the entry values of pA/pB;
	// advance once, after the last load.
	add	pB, pB, #64
	add	pA, pA, #128

	fmla	v27.2d, v7.2d, v13.d[0]
	fmla	v31.2d, v7.2d, v13.d[1]
.endm


// KERNEL8x4_M1
// One accumulation step into v16-v31 using operands that are already in
// v0-v3 and v8/v9 on entry, while fetching the next operand set.
//   loads   : q4-q7 from [pA], [pA,#32]; q12/q13 from [pB]
//   computes: v16-v31 += v0-v3 * lanes of v8/v9 (one fmla per accumulator)
//   advances pA by 64 and pB by 32 bytes
.macro KERNEL8x4_M1
	// Plain-offset loads; the pointers are advanced once by the add
	// instructions below rather than by post-indexed addressing.
	ldp	q12, q13, [pB]
	ldp	q4, q5, [pA]
	ldp	q6, q7, [pA, #32]

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v28.2d, v0.2d, v9.d[1]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE]

	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v25.2d, v1.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]
	fmla	v29.2d, v1.2d, v9.d[1]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE_64]

	fmla	v18.2d, v2.2d, v8.d[0]
	fmla	v22.2d, v2.2d, v8.d[1]
	fmla	v26.2d, v2.2d, v9.d[0]
	fmla	v30.2d, v2.2d, v9.d[1]
	fmla	v19.2d, v3.2d, v8.d[0]
	fmla	v23.2d, v3.2d, v8.d[1]

	add	pB, pB, #32
	add	pA, pA, #64

	fmla	v27.2d, v3.2d, v9.d[0]
	fmla	v31.2d, v3.2d, v9.d[1]
.endm

// KERNEL8x4_M2
// Counterpart of KERNEL8x4_M1 with the two register sets swapped: it
// consumes v4-v7 and v12/v13 (which must be loaded on entry) and fetches
// the next operands into v0-v3 and v8/v9.
//   loads   : q0-q3 from [pA], [pA,#32]; q8/q9 from [pB]
//   computes: v16-v31 += v4-v7 * lanes of v12/v13 (one fmla per accumulator)
//   advances pA by 64 and pB by 32 bytes
.macro KERNEL8x4_M2
	// Plain-offset loads; each pointer is advanced exactly once below.
	ldp	q8, q9, [pB]
	ldp	q0, q1, [pA]
	ldp	q2, q3, [pA, #32]

	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v28.2d, v4.2d, v13.d[1]

	prfm	PLDL1KEEP, [pB, B_PRE_SIZE]

	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v25.2d, v5.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]
	fmla	v29.2d, v5.2d, v13.d[1]

	fmla	v18.2d, v6.2d, v12.d[0]
	fmla	v22.2d, v6.2d, v12.d[1]
	fmla	v26.2d, v6.2d, v13.d[0]
	fmla	v30.2d, v6.2d, v13.d[1]

	add	pB, pB, #32
	add	pA, pA, #64

	fmla	v19.2d, v7.2d, v12.d[0]
	fmla	v23.2d, v7.2d, v12.d[1]
	fmla	v27.2d, v7.2d, v13.d[0]
	fmla	v31.2d, v7.2d, v13.d[1]
.endm
@@ -342,13 +319,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla v24.2d, v4.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]

prfm PLDL1KEEP, [pB, B_PRE_SIZE]

fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.d[1]
fmla v29.2d, v5.2d, v13.d[1]

prfm PLDL1KEEP, [pB, B_PRE_SIZE]

fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v12.d[1]
fmla v26.2d, v6.2d, v13.d[0]
@@ -361,42 +338,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

// KERNEL8x4_SUB
// Self-contained single accumulation step: it loads its own operands and
// does not depend on registers preloaded by another macro.
//   loads   : q0-q3 from [pA], [pA,#32]; q8/q9 from [pB]
//   computes: v16-v31 += v0-v3 * lanes of v8/v9 (one fmla per accumulator)
//   advances pA by 64 and pB by 32 bytes
.macro KERNEL8x4_SUB
	// All three loads address memory relative to the entry values of
	// pA/pB; the pointers are advanced once, near the end of the macro.
	ldp	q0, q1, [pA]
	ldp	q8, q9, [pB]
	ldp	q2, q3, [pA, #32]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE]

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]

	prfm	PLDL1KEEP, [pA, A_PRE_SIZE_64]

	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v28.2d, v0.2d, v9.d[1]
	fmla	v25.2d, v1.2d, v9.d[0]
	fmla	v29.2d, v1.2d, v9.d[1]

	prfm	PLDL1KEEP, [pB, B_PRE_SIZE]

	fmla	v18.2d, v2.2d, v8.d[0]
	fmla	v22.2d, v2.2d, v8.d[1]
	fmla	v26.2d, v2.2d, v9.d[0]
	fmla	v30.2d, v2.2d, v9.d[1]

	add	pB, pB, #32
	add	pA, pA, #64

	fmla	v19.2d, v3.2d, v8.d[0]
	fmla	v27.2d, v3.2d, v9.d[0]
	fmla	v31.2d, v3.2d, v9.d[1]
	fmla	v23.2d, v3.2d, v8.d[1]
.endm


Loading…
Cancel
Save