From 488147dcbdc06c44b3eb46da083e64bbdf661eea Mon Sep 17 00:00:00 2001
From: zhanyuan
Date: Sat, 24 Oct 2020 14:39:11 +0800
Subject: [PATCH] [MSLITE] Optimize depthwise conv 3x3 arm64

---
 .../lite/nnacl/assembly/arm64/ConvDw3x3Int8.S | 159 ++++++++++++++++--
 1 file changed, 141 insertions(+), 18 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
index a31be4f7ee..441c5282fe 100644
--- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S
@@ -67,7 +67,6 @@ ConvDw3x3Int8Neon64:
     ld1 {v7.8h}, [x2], x20
     ld1 {v8.8h}, [x2], x20
 
-Loop:
     mov x16, x1
     add x17, x16, x5
     add x18, x17, x5
@@ -83,6 +82,8 @@ Loop:
 
     ld1 {v21.4s}, [x3]
     ld1 {v22.4s}, [x19]
+    ld1 {v23.4s}, [x3]
+    ld1 {v24.4s}, [x19]
 
     // subtract input zp
     ssubl v9.8h, v9.8b, v25.8b
@@ -95,31 +96,160 @@ Loop:
     ssubl v18.8h, v18.8b, v25.8b
     ssubl v19.8h, v19.8b, v25.8b
 
+    cmp w8, #2
+    beq WIDTH2_LEFT
     cmp w8, #1
-    beq Width1
+    beq WIDTH1_LEFT
 
-Width2:
+HEIGHT1_LOOP:
+    smlal v21.4s, v0.4h, v9.4h
     ld1 {v12.8b}, [x16]
+    smlal2 v22.4s, v0.8h, v9.8h
     ld1 {v16.8b}, [x17]
+    smlal v23.4s, v0.4h, v10.4h
+    smlal2 v24.4s, v0.8h, v10.8h
     ld1 {v20.8b}, [x18]
-
-    ld1 {v23.4s}, [x3]
-    ld1 {v24.4s}, [x19]
-
+    add x1, x1, x21
     ssubl v12.8h, v12.8b, v25.8b
+    smlal v21.4s, v1.4h, v10.4h
+    mov x16, x1
+    add x17, x16, x5
+    add x18, x17, x5
+    smlal2 v22.4s, v1.8h, v10.8h
+    ld1 {v9.8b}, [x16], x4
     ssubl v16.8h, v16.8b, v25.8b
+    smlal v23.4s, v1.4h, v11.4h
+    ld1 {v10.8b}, [x16], x4
     ssubl v20.8h, v20.8b, v25.8b
+    smlal2 v24.4s, v1.8h, v11.8h
+    smlal v21.4s, v2.4h, v11.4h
+    smlal2 v22.4s, v2.8h, v11.8h
+    ld1 {v11.8b}, [x16], x4
+    smlal v23.4s, v2.4h, v12.4h
+    smlal2 v24.4s, v2.8h, v12.8h
+    smlal v21.4s, v3.4h, v13.4h
+    smlal2 v22.4s, v3.8h, v13.8h
+    ld1 {v13.8b}, [x17], x4
+    smlal v23.4s, v3.4h, v14.4h
+    smlal2 v24.4s, v3.8h, v14.8h
+    smlal v21.4s, v4.4h, v14.4h
+    smlal2 v22.4s, v4.8h, v14.8h
+    ld1 {v14.8b}, [x17], x4
+    smlal v23.4s, v4.4h, v15.4h
+    smlal2 v24.4s, v4.8h, v15.8h
+    smlal v21.4s, v5.4h, v15.4h
+    smlal2 v22.4s, v5.8h, v15.8h
+    ld1 {v15.8b}, [x17], x4
+    smlal v23.4s, v5.4h, v16.4h
+    smlal2 v24.4s, v5.8h, v16.8h
+    smlal v21.4s, v6.4h, v17.4h
+    smlal2 v22.4s, v6.8h, v17.8h
+    ld1 {v17.8b}, [x18], x4
+    smlal v23.4s, v6.4h, v18.4h
+    smlal2 v24.4s, v6.8h, v18.8h
+    smlal v21.4s, v7.4h, v18.4h
+    smlal2 v22.4s, v7.8h, v18.8h
+    ld1 {v18.8b}, [x18], x4
+    smlal v23.4s, v7.4h, v19.4h
+    smlal2 v24.4s, v7.8h, v19.8h
+    smlal v21.4s, v8.4h, v19.4h
+    smlal2 v22.4s, v8.8h, v19.8h
+    ld1 {v19.8b}, [x18], x4
+    smlal v23.4s, v8.4h, v20.4h
+    smlal2 v24.4s, v8.8h, v20.8h
+
+    // Apply left shift
+    sqshl v21.4s, v21.4s, v26.4s
+    sqshl v22.4s, v22.4s, v26.4s
+    sqshl v23.4s, v23.4s, v26.4s
+    sqshl v24.4s, v24.4s, v26.4s
+    // Apply the fixed-point part of the multiplier.
+    sqrdmulh v21.4s, v21.4s, v27.4s
+    sqrdmulh v22.4s, v22.4s, v27.4s
+    sqrdmulh v23.4s, v23.4s, v27.4s
+    sqrdmulh v24.4s, v24.4s, v27.4s
+
+    // Apply right shift
+    and v12.16b, v28.16b, v21.16b
+    sshr v12.4s, v12.4s, #31
+    sqadd v21.4s, v21.4s, v12.4s
+    srshl v21.4s, v21.4s, v28.4s
+    and v16.16b, v28.16b, v22.16b
+    sshr v16.4s, v16.4s, #31
+    sqadd v22.4s, v22.4s, v16.4s
+    srshl v22.4s, v22.4s, v28.4s
+    and v20.16b, v28.16b, v23.16b
+    sshr v20.4s, v20.4s, #31
+    sqadd v23.4s, v23.4s, v20.4s
+    srshl v23.4s, v23.4s, v28.4s
+    and v12.16b, v28.16b, v24.16b
+    sshr v12.4s, v12.4s, #31
+    sqadd v24.4s, v24.4s, v12.4s
+    srshl v24.4s, v24.4s, v28.4s
+
+    // Add output zero point
+    sqadd v21.4s, v21.4s, v29.4s
+    sqadd v22.4s, v22.4s, v29.4s
+    sqadd v23.4s, v23.4s, v29.4s
+    sqadd v24.4s, v24.4s, v29.4s
+
+    // Apply min bound
+    smax v21.4s, v21.4s, v30.4s
+    smax v22.4s, v22.4s, v30.4s
+    smax v23.4s, v23.4s, v30.4s
+    smax v24.4s, v24.4s, v30.4s
+
+    // Apply max bound
+    smin v21.4s, v21.4s, v31.4s
+    smin v22.4s, v22.4s, v31.4s
+    smin v23.4s, v23.4s, v31.4s
+    smin v24.4s, v24.4s, v31.4s
+
+    sqxtn v21.4h, v21.4s
+    sqxtn2 v21.8h, v22.4s
+    ld1 {v22.4s}, [x19]
+    ssubl v9.8h, v9.8b, v25.8b
+    ssubl v10.8h, v10.8b, v25.8b
+    sqxtn v23.4h, v23.4s
+    sqxtn2 v23.8h, v24.4s
+    ld1 {v24.4s}, [x19]
+    sqxtn v21.8b, v21.8h
+    sqxtn2 v21.16b, v23.8h
+    st1 {v21.8b}, [x0], x6
+    mov v23.d[0], v21.d[1]
+    ld1 {v21.4s}, [x3]
+    st1 {v23.8b}, [x0], x6
+    ssubl v11.8h, v11.8b, v25.8b
+    ssubl v13.8h, v13.8b, v25.8b
+    ld1 {v23.4s}, [x3]
+    ssubl v14.8h, v14.8b, v25.8b
+    ssubl v15.8h, v15.8b, v25.8b
+    ssubl v17.8h, v17.8b, v25.8b
+    ssubl v18.8h, v18.8b, v25.8b
+    ssubl v19.8h, v19.8b, v25.8b
+    sub w8, w8, #2
+    cmp w8, #2
+    bgt HEIGHT1_LOOP
+
+    cmp w8, #2
+    blt WIDTH1_LEFT
+
 
+WIDTH2_LEFT:
     smlal v21.4s, v0.4h, v9.4h
     smlal2 v22.4s, v0.8h, v9.8h
+    ld1 {v12.8b}, [x16]
+    ssubl v12.8h, v12.8b, v25.8b
     smlal v23.4s, v0.4h, v10.4h
     smlal2 v24.4s, v0.8h, v10.8h
     smlal v21.4s, v1.4h, v10.4h
     smlal2 v22.4s, v1.8h, v10.8h
+    ld1 {v16.8b}, [x17]
     smlal v23.4s, v1.4h, v11.4h
     smlal2 v24.4s, v1.8h, v11.8h
     smlal v21.4s, v2.4h, v11.4h
     smlal2 v22.4s, v2.8h, v11.8h
+    ld1 {v20.8b}, [x18]
     smlal v23.4s, v2.4h, v12.4h
     smlal2 v24.4s, v2.8h, v12.8h
     smlal v21.4s, v3.4h, v13.4h
@@ -128,10 +258,12 @@ Width2:
     smlal2 v24.4s, v3.8h, v14.8h
     smlal v21.4s, v4.4h, v14.4h
     smlal2 v22.4s, v4.8h, v14.8h
+    ssubl v16.8h, v16.8b, v25.8b
     smlal v23.4s, v4.4h, v15.4h
     smlal2 v24.4s, v4.8h, v15.8h
     smlal v21.4s, v5.4h, v15.4h
     smlal2 v22.4s, v5.8h, v15.8h
+    ssubl v20.8h, v20.8b, v25.8b
     smlal v23.4s, v5.4h, v16.4h
     smlal2 v24.4s, v5.8h, v16.8h
     smlal v21.4s, v6.4h, v17.4h
@@ -201,16 +333,12 @@ Width2:
     sqxtn2 v23.8h, v24.4s
     sqxtn v21.8b, v21.8h
     sqxtn2 v21.16b, v23.8h
-
     st1 {v21.8b}, [x0], x6
     mov v23.d[0], v21.d[1]
     st1 {v23.8b}, [x0], x6
-    sub w8, w8, #2
-    cbz w8, End
-    add x1, x1, x21
-    b Loop
+    b End
 
-Width1:
+WIDTH1_LEFT:
     smlal v21.4s, v0.4h, v9.4h
     smlal2 v22.4s, v0.8h, v9.8h
     smlal v21.4s, v1.4h, v10.4h
@@ -263,12 +391,7 @@ Width1:
     sqxtn v21.4h, v21.4s
     sqxtn2 v21.8h, v22.4s
     sqxtn v21.8b, v21.8h
-
     st1 {v21.8b}, [x0], x6
-    sub w8, w8, #1
-    cbz w8, End
-    add x1, x1, x4
-    b Loop
 
 End:
     sub sp, sp, #160
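
Note: the rewritten HEIGHT1_LOOP computes two adjacent output pixels for eight channels per iteration, keeping three input rows of four columns live in v9-v20 and interleaving the loads for the next iteration with the multiply-accumulates to hide load latency. A scalar C sketch of the arithmetic carried by the two accumulator pairs (v21/v22 and v23/v24) is below; the register roles are inferred from the instruction sequence, and the array layout and names are illustrative, not part of the patch:

    #include <stdint.h>

    /* One loop iteration: depthwise 3x3, 8 channels, 2 adjacent outputs.
     * weight[kh][kw][c] ~ v0..v8  (int16, pre-widened kernel)
     * input[kh][kw][c]  ~ v9..v20 after ssubl removed the input zero point
     * acc0/acc1         ~ v21/v22 and v23/v24 (int32), preloaded from [x3]/[x19]
     */
    static void DwConv3x3TwoOutputs(int32_t acc0[8], int32_t acc1[8],
                                    const int16_t weight[3][3][8],
                                    const int16_t input[3][4][8]) {
      for (int c = 0; c < 8; ++c) {
        for (int kh = 0; kh < 3; ++kh) {    /* rows addressed by x16/x17/x18 */
          for (int kw = 0; kw < 3; ++kw) {
            acc0[c] += weight[kh][kw][c] * input[kh][kw][c];      /* smlal/smlal2 -> v21/v22 */
            acc1[c] += weight[kh][kw][c] * input[kh][kw + 1][c];  /* smlal/smlal2 -> v23/v24 */
          }
        }
      }
    }

Because the two outputs overlap in two of their three input columns, computing them together reuses the loaded columns and roughly halves the loads per output.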
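The added requantization block follows the standard gemmlowp-style fixed-point rescaling, matching the comments in the diff: sqshl by v26 (left shift), sqrdmulh by v27 (fixed-point multiplier), the and/sshr/sqadd/srshl group (rounding right shift; v28 appears to hold the shift as a negative count so srshl shifts right), sqadd of v29 (output zero point), smax/smin against v30/v31 (activation bounds), then sqxtn narrows to int8. A scalar model of one 32-bit accumulator lane, with helper names following the gemmlowp convention (illustrative, not MindSpore's API):

    #include <stdint.h>

    /* sqrdmulh: saturating, rounding, doubling high half of a 32x32 multiply. */
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
      if (a == INT32_MIN && b == INT32_MIN) {
        return INT32_MAX;                       /* the single overflowing case */
      }
      int64_t ab = (int64_t)a * (int64_t)b;
      int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
      return (int32_t)((ab + nudge) >> 31);
    }

    /* The and/sshr/sqadd + srshl group: arithmetic right shift rounding to
     * nearest, ties away from zero (the and+sshr extracts a -1 nudge for
     * negative inputs before the rounding shift). */
    static int32_t RoundingDivideByPOT(int32_t x, int exponent) {
      int32_t mask = ((int32_t)1 << exponent) - 1;
      int32_t remainder = x & mask;
      int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
      return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    /* One accumulator lane of v21..v24 -> one int8 output lane. */
    static int8_t RequantizeLane(int32_t acc, int left_shift, int32_t multiplier,
                                 int right_shift, int32_t out_zp,
                                 int32_t act_min, int32_t act_max) {
      /* sqshl v26: saturating left shift (saturation elided in this sketch). */
      acc = (int32_t)((int64_t)acc << left_shift);
      acc = SaturatingRoundingDoublingHighMul(acc, multiplier); /* sqrdmulh v27 */
      acc = RoundingDivideByPOT(acc, right_shift);  /* v28 stores -right_shift */
      acc += out_zp;                                /* sqadd v29 */
      if (acc < act_min) acc = act_min;             /* smax v30 */
      if (acc > act_max) acc = act_max;             /* smin v31 */
      return (int8_t)acc;                           /* sqxtn/sqxtn2 narrowing */
    }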