diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S index e48d5f9f6d..fce898a286 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Corner.S @@ -119,12 +119,15 @@ ConvDw3x3Int8Corner: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -145,11 +148,6 @@ ConvDw3x3Int8Corner: st1 {v24.s}[0], [x0], #4 ld1 {v23.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16 - cbz x14, NEXT_LOOP - ld1 {v27.4s}, [x9], #16 - ld1 {v28.4s}, [x10], #16 - ld1 {v29.4s}, [x11], #16 - NEXT_LOOP: sub x6, x6, #8 cmp x6, #8 bgt LoopC8 @@ -181,14 +179,14 @@ ConvDw3x3Int8Corner: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S index 5ca28d0d78..339ea05b77 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Horizontal.S @@ -148,12 +148,15 @@ ConvDw3x3Int8Horizontal: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -209,14 +212,14 @@ ConvDw3x3Int8Horizontal: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s diff --git a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S index 383d6b4f36..d1b0f02732 100644 --- a/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S +++ b/mindspore/lite/nnacl/assembly/arm64/ConvDw3x3Int8Vertical.S @@ -139,12 +139,15 @@ ConvDw3x3Int8Vertical: b AddZpLoop PerChannelPostLoop: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s + ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + ld1 {v28.4s}, [x10], #16 + sqrdmulh v24.4s, v24.4s, v27.4s + ld1 {v27.4s}, [x9], #16 sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 @@ -165,11 +168,6 @@ ConvDw3x3Int8Vertical: st1 {v24.s}[0], [x0], #4 ld1 {v23.4s}, [x3], #16 ld1 {v24.4s}, [x3], #16 - cbz x14, NEXT_LOOP - ld1 {v27.4s}, [x9], #16 - ld1 {v28.4s}, [x10], #16 - ld1 {v29.4s}, [x11], #16 - NEXT_LOOP: sub x6, x6, #8 cmp x6, #8 bgt LoopC8 @@ -205,14 +203,14 @@ ConvDw3x3Int8Vertical: b AddZp PerChannelPost: sqshl v23.4s, v23.4s, v28.4s - sqshl v24.4s, v24.4s, v28.4s ld1 {v28.4s}, [x10], #16 sqrdmulh v23.4s, v23.4s, v27.4s - sqrdmulh v24.4s, v24.4s, v27.4s ld1 {v27.4s}, [x9], #16 sqrshl v23.4s, v23.4s, v29.4s - sqrshl v24.4s, v24.4s, v29.4s ld1 {v29.4s}, [x11], #16 + sqshl v24.4s, v24.4s, v28.4s + sqrdmulh v24.4s, v24.4s, v27.4s + sqrshl v24.4s, v24.4s, v29.4s AddZp: add v23.4s, v23.4s, v26.4s