Browse Source

!9020 [MSLITE][Develop] Fix bug in ARM CPU int8 3x3 depthwise convolution

From: @yangruoqi713
Reviewed-by: @HilbertDavid,@zhang_xue_tong
Signed-off-by: @HilbertDavid
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
0344078a9c
3 changed files with 24 additions and 25 deletions
  1. +8
    -10
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S
  2. +8
    -5
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S
  3. +8
    -10
      mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S

+ 8
- 10
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S View File

@@ -119,12 +119,15 @@ ConvDw3x3Int8Corner:
b AddZpLoop
PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16

@@ -145,11 +148,6 @@ ConvDw3x3Int8Corner:
st1 {v24.s}[0], [x0], #4
ld1 {v23.4s}, [x3], #16
ld1 {v24.4s}, [x3], #16
cbz x14, NEXT_LOOP
ld1 {v27.4s}, [x9], #16
ld1 {v28.4s}, [x10], #16
ld1 {v29.4s}, [x11], #16
NEXT_LOOP:
sub x6, x6, #8
cmp x6, #8
bgt LoopC8
@@ -181,14 +179,14 @@ ConvDw3x3Int8Corner:
b AddZp
PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s

AddZp:
add v23.4s, v23.4s, v26.4s


+ 8
- 5
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S View File

@@ -148,12 +148,15 @@ ConvDw3x3Int8Horizontal:
b AddZpLoop
PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16

@@ -209,14 +212,14 @@ ConvDw3x3Int8Horizontal:
b AddZp
PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s

AddZp:
add v23.4s, v23.4s, v26.4s


+ 8
- 10
mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S View File

@@ -139,12 +139,15 @@ ConvDw3x3Int8Vertical:
b AddZpLoop
PerChannelPostLoop:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16

@@ -165,11 +168,6 @@ ConvDw3x3Int8Vertical:
st1 {v24.s}[0], [x0], #4
ld1 {v23.4s}, [x3], #16
ld1 {v24.4s}, [x3], #16
cbz x14, NEXT_LOOP
ld1 {v27.4s}, [x9], #16
ld1 {v28.4s}, [x10], #16
ld1 {v29.4s}, [x11], #16
NEXT_LOOP:
sub x6, x6, #8
cmp x6, #8
bgt LoopC8
@@ -205,14 +203,14 @@ ConvDw3x3Int8Vertical:
b AddZp
PerChannelPost:
sqshl v23.4s, v23.4s, v28.4s
sqshl v24.4s, v24.4s, v28.4s
ld1 {v28.4s}, [x10], #16
sqrdmulh v23.4s, v23.4s, v27.4s
sqrdmulh v24.4s, v24.4s, v27.4s
ld1 {v27.4s}, [x9], #16
sqrshl v23.4s, v23.4s, v29.4s
sqrshl v24.4s, v24.4s, v29.4s
ld1 {v29.4s}, [x11], #16
sqshl v24.4s, v24.4s, v28.4s
sqrdmulh v24.4s, v24.4s, v27.4s
sqrshl v24.4s, v24.4s, v29.4s

AddZp:
add v23.4s, v23.4s, v26.4s


Loading…
Cancel
Save