From 8fc6b0e02a7741a0b5f41d64d9b7074ff3af22af Mon Sep 17 00:00:00 2001 From: nihuini Date: Fri, 20 Dec 2019 11:12:00 +0800 Subject: [PATCH] add arm-a53-a55-dual-issue doc --- .../developer-guide/arm-a53-a55-dual-issue.md | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 docs/developer-guide/arm-a53-a55-dual-issue.md diff --git a/docs/developer-guide/arm-a53-a55-dual-issue.md b/docs/developer-guide/arm-a53-a55-dual-issue.md new file mode 100644 index 000000000..b5da97e72 --- /dev/null +++ b/docs/developer-guide/arm-a53-a55-dual-issue.md @@ -0,0 +1,85 @@ +## natural assembly +* no register dependency, no penalty +``` +ld1 {v0.4s}, [r0], #16 +fmla v10.4s, v16.4s, v24.s[0] +fmla v11.4s, v16.4s, v24.s[1] +fmla v12.4s, v16.4s, v24.s[2] +fmla v13.4s, v16.4s, v24.s[3] +``` + +## A53 +* 128bit vector load cannot be dual issued with fmla, wait 2 cycles +* 64bit vector load cannot be dual issued with fmla, wait 1 cycle +* 64bit integer load can be dual issued with fmla, no penalty +* pointer update can be dual issued with fmla, no penalty +* 64bit vector load and 64bit vector insert can be dual issued, no penalty +* any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline) + +### practical guide +* use 64bit vector load only +* issue vector load every three fmla +* 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert +* load the remaining 64bit into integer register, dual issue with fmla +* update pointer, dual issue with fmla +* insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load +* add nop every three fmla if no load, seems to be faster +``` +ldr d0, [r0] // 1 cycle, v0 first 64bit +fmla +ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register +fmla +add r0, r0, #16 // 0 cycle, update pointer +fmla +ldr d1, [r0] // 1 cycle, v1 first 64bit +ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete +fmla +ldr x23, [r0, #8] // 0 cycle, v1 
second 64bit to temp register +fmla +add r0, r0, #16 // 0 cycle, update pointer +fmla +ins v1.d[1], x23 // 1 cycle, v1 second 64bit complete +nop +fmla +fmla +fmla +nop +nop +fmla +fmla +fmla +``` + +## A55 +* 128bit vector load cannot be dual issued with fmla, wait 2 cycles +* 64bit vector load can be dual issued with fmla, no penalty +* 64bit integer load can be dual issued with fmla, no penalty +* pointer update can be dual issued with fmla, no penalty +* 64bit vector insert can be dual issued with fmla, no penalty + +### practical guide +* use 64bit vector load only +* load 64bit, dual issue with fmla +* load the remaining 64bit into integer register, dual issue with fmla +* update pointer, dual issue with fmla +* insert 64bit into vector from integer register, dual issue with fmla +* interleaved loads loosen register dependencies +* nop trick is not needed +``` +ldr d0, [r0] // 0 cycle, v0 first 64bit +fmla +ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register +fmla +add r0, r0, #16 // 0 cycle, update pointer +fmla +ldr d1, [r0] // 0 cycle, v1 first 64bit +fmla +ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete +fmla +ldr x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register +fmla +add r0, r0, #16 // 0 cycle, update pointer +fmla +ins v1.d[1], x23 // 0 cycle, v1 second 64bit complete +fmla +```