From e097608b32d99e790b916d2f3724d655e88f6c95 Mon Sep 17 00:00:00 2001 From: zhanyuan Date: Mon, 26 Oct 2020 15:03:52 +0800 Subject: [PATCH] [MSLITE] Optimize depthwise conv int8 assembly --- .../lite/nnacl/assembly/arm64/ConvDw3x3Int8.S | 84 +++++++------------ 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S index 441c5282fe..0f1b4d3703 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8.S @@ -158,36 +158,28 @@ HEIGHT1_LOOP: smlal v23.4s, v8.4h, v20.4h smlal2 v24.4s, v8.8h, v20.8h - // Apply left shfit + cbz w12, SKIP_LEFTSHIFT1 sqshl v21.4s, v21.4s, v26.4s sqshl v22.4s, v22.4s, v26.4s sqshl v23.4s, v23.4s, v26.4s sqshl v24.4s, v24.4s, v26.4s - - // Apply the fixed-point part of the multiplier. sqrdmulh v21.4s, v21.4s, v27.4s sqrdmulh v22.4s, v22.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v24.4s, v24.4s, v27.4s + b OUTZP1 - // Apply right shfit - and v12.16b, v28.16b, v21.16b - sshr v12.4s, v12.4s, #31 - sqadd v21.4s, v21.4s, v12.4s - srshl v21.4s, v21.4s, v28.4s - and v16.16b, v28.16b, v22.16b - sshr v16.4s, v16.4s, #31 - sqadd v22.4s, v22.4s, v16.4s - srshl v22.4s, v22.4s, v28.4s - and v20.16b, v28.16b, v23.16b - sshr v20.4s, v20.4s, #31 - sqadd v23.4s, v23.4s, v20.4s - srshl v23.4s, v23.4s, v28.4s - and v12.16b, v28.16b, v24.16b - sshr v12.4s, v12.4s, #31 - sqadd v24.4s, v24.4s, v12.4s - srshl v24.4s, v24.4s, v28.4s +SKIP_LEFTSHIFT1: + sqrdmulh v21.4s, v21.4s, v27.4s + sqrdmulh v22.4s, v22.4s, v27.4s + sqrdmulh v23.4s, v23.4s, v27.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v21.4s, v21.4s, v28.4s + sqrshl v22.4s, v22.4s, v28.4s + sqrshl v23.4s, v23.4s, v28.4s + sqrshl v24.4s, v24.4s, v28.4s +OUTZP1: // Add output zero point sqadd v21.4s, v21.4s, v29.4s sqadd v22.4s, v22.4s, v29.4s @@ -279,36 +271,28 @@ WIDTH2_LEFT: smlal v23.4s, v8.4h, v20.4h smlal2 v24.4s, v8.8h, v20.8h - // Apply left shfit + cbz w12, SKIP_LEFTSHIFT2 sqshl v21.4s, v21.4s, v26.4s sqshl v22.4s, v22.4s, v26.4s sqshl v23.4s, v23.4s, v26.4s sqshl v24.4s, v24.4s, v26.4s - - // Apply the fixed-point part of the multiplier. sqrdmulh v21.4s, v21.4s, v27.4s sqrdmulh v22.4s, v22.4s, v27.4s sqrdmulh v23.4s, v23.4s, v27.4s sqrdmulh v24.4s, v24.4s, v27.4s + b OUTZP2 - // Apply right shfit - and v9.16b, v28.16b, v21.16b - sshr v9.4s, v9.4s, #31 - sqadd v21.4s, v21.4s, v9.4s - srshl v21.4s, v21.4s, v28.4s - and v10.16b, v28.16b, v22.16b - sshr v10.4s, v10.4s, #31 - sqadd v22.4s, v22.4s, v10.4s - srshl v22.4s, v22.4s, v28.4s - and v11.16b, v28.16b, v23.16b - sshr v11.4s, v11.4s, #31 - sqadd v23.4s, v23.4s, v11.4s - srshl v23.4s, v23.4s, v28.4s - and v12.16b, v28.16b, v24.16b - sshr v12.4s, v12.4s, #31 - sqadd v24.4s, v24.4s, v12.4s - srshl v24.4s, v24.4s, v28.4s +SKIP_LEFTSHIFT2: + sqrdmulh v21.4s, v21.4s, v27.4s + sqrdmulh v22.4s, v22.4s, v27.4s + sqrdmulh v23.4s, v23.4s, v27.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v21.4s, v21.4s, v28.4s + sqrshl v22.4s, v22.4s, v28.4s + sqrshl v23.4s, v23.4s, v28.4s + sqrshl v24.4s, v24.4s, v28.4s +OUTZP2: // Add output zero point sqadd v21.4s, v21.4s, v29.4s sqadd v22.4s, v22.4s, v29.4s @@ -358,24 +342,20 @@ WIDTH1_LEFT: smlal v21.4s, v8.4h, v19.4h smlal2 v22.4s, v8.8h, v19.8h - // Apply left shfit + cbz w12, SKIP_LEFTSHIFT3 sqshl v21.4s, v21.4s, v26.4s sqshl v22.4s, v22.4s, v26.4s - - // Apply the fixed-point part of the multiplier. sqrdmulh v21.4s, v21.4s, v27.4s sqrdmulh v22.4s, v22.4s, v27.4s + b OUTZP3 - // Apply right shfit - and v9.16b, v28.16b, v21.16b - sshr v9.4s, v9.4s, #31 - sqadd v21.4s, v21.4s, v9.4s - srshl v21.4s, v21.4s, v28.4s - and v10.16b, v28.16b, v22.16b - sshr v10.4s, v10.4s, #31 - sqadd v22.4s, v22.4s, v10.4s - srshl v22.4s, v22.4s, v28.4s +SKIP_LEFTSHIFT3: + sqrdmulh v21.4s, v21.4s, v27.4s + sqrdmulh v22.4s, v22.4s, v27.4s + sqrshl v21.4s, v21.4s, v28.4s + sqrshl v22.4s, v22.4s, v28.4s +OUTZP3: // Add output zero point sqadd v21.4s, v21.4s, v29.4s sqadd v22.4s, v22.4s, v29.4s