From 8e8c0ed73fc7082169e165924ea2da58dbba8b91 Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Wed, 25 Nov 2020 19:07:04 +0800 Subject: [PATCH] [MSLITE][Develop] fix bug of arm cpu conv depthwise int8 3x3 --- .../nnacl/assembly/arm64/ConvDw3x3Int8Corner.S | 18 ++++++++---------- .../assembly/arm64/ConvDw3x3Int8Horizontal.S | 13 ++++++++----- .../assembly/arm64/ConvDw3x3Int8Vertical.S | 18 ++++++++---------- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S index e48d5f9f6d..fce898a286 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S @@ -119,12 +119,15 @@ ConvDw3x3Int8Corner: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -145,11 +148,6 @@ ConvDw3x3Int8Corner: st1 {v24.s}[0], [x0], #4 ld1 {v23.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16 - cbz x14, NEXT_LOOP - ld1 {v27.4s}, [x9], #16 - ld1 {v28.4s}, [x10], #16 - ld1 {v29.4s}, [x11], #16 - NEXT_LOOP: sub x6, x6, #8 cmp x6, #8 bgt LoopC8 @@ -181,14 +179,14 @@ ConvDw3x3Int8Corner: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S index 5ca28d0d78..339ea05b77 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S @@ -148,12 +148,15 @@ ConvDw3x3Int8Horizontal: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -209,14 +212,14 @@ ConvDw3x3Int8Horizontal: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S index 383d6b4f36..d1b0f02732 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S @@ -139,12 +139,15 @@ ConvDw3x3Int8Vertical: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -165,11 +168,6 @@ ConvDw3x3Int8Vertical: st1 {v24.s}[0], [x0], #4 ld1 {v23.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16 - cbz x14, NEXT_LOOP - ld1 {v27.4s}, [x9], #16 - ld1 {v28.4s}, [x10], #16 - ld1 {v29.4s}, [x11], #16 - NEXT_LOOP: sub x6, x6, #8 cmp x6, #8 bgt LoopC8 @@ -205,14 +203,14 @@ ConvDw3x3Int8Vertical: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s