From: @yangruoqi713 Reviewed-by: @HilbertDavid,@zhang_xue_tong Signed-off-by: @HilbertDavid tags/v1.1.0
| @@ -119,12 +119,15 @@ ConvDw3x3Int8Corner: | |||||
| b AddZpLoop | b AddZpLoop | ||||
| PerChannelPostLoop: | PerChannelPostLoop: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | |||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | |||||
| sqrshl v24.4s, v24.4s, v29.4s | sqrshl v24.4s, v24.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| @@ -145,11 +148,6 @@ ConvDw3x3Int8Corner: | |||||
| st1 {v24.s}[0], [x0], #4 | st1 {v24.s}[0], [x0], #4 | ||||
| ld1 {v23.4s}, [x3], #16 | ld1 {v23.4s}, [x3], #16 | ||||
| ld1 {v24.4s}, [x3], #16 | ld1 {v24.4s}, [x3], #16 | ||||
| cbz x14, NEXT_LOOP | |||||
| ld1 {v27.4s}, [x9], #16 | |||||
| ld1 {v28.4s}, [x10], #16 | |||||
| ld1 {v29.4s}, [x11], #16 | |||||
| NEXT_LOOP: | |||||
| sub x6, x6, #8 | sub x6, x6, #8 | ||||
| cmp x6, #8 | cmp x6, #8 | ||||
| bgt LoopC8 | bgt LoopC8 | ||||
| @@ -181,14 +179,14 @@ ConvDw3x3Int8Corner: | |||||
| b AddZp | b AddZp | ||||
| PerChannelPost: | PerChannelPost: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| AddZp: | AddZp: | ||||
| add v23.4s, v23.4s, v26.4s | add v23.4s, v23.4s, v26.4s | ||||
| @@ -148,12 +148,15 @@ ConvDw3x3Int8Horizontal: | |||||
| b AddZpLoop | b AddZpLoop | ||||
| PerChannelPostLoop: | PerChannelPostLoop: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | |||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | |||||
| sqrshl v24.4s, v24.4s, v29.4s | sqrshl v24.4s, v24.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| @@ -209,14 +212,14 @@ ConvDw3x3Int8Horizontal: | |||||
| b AddZp | b AddZp | ||||
| PerChannelPost: | PerChannelPost: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| AddZp: | AddZp: | ||||
| add v23.4s, v23.4s, v26.4s | add v23.4s, v23.4s, v26.4s | ||||
| @@ -139,12 +139,15 @@ ConvDw3x3Int8Vertical: | |||||
| b AddZpLoop | b AddZpLoop | ||||
| PerChannelPostLoop: | PerChannelPostLoop: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | |||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | |||||
| sqrshl v24.4s, v24.4s, v29.4s | sqrshl v24.4s, v24.4s, v29.4s | ||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| @@ -165,11 +168,6 @@ ConvDw3x3Int8Vertical: | |||||
| st1 {v24.s}[0], [x0], #4 | st1 {v24.s}[0], [x0], #4 | ||||
| ld1 {v23.4s}, [x3], #16 | ld1 {v23.4s}, [x3], #16 | ||||
| ld1 {v24.4s}, [x3], #16 | ld1 {v24.4s}, [x3], #16 | ||||
| cbz x14, NEXT_LOOP | |||||
| ld1 {v27.4s}, [x9], #16 | |||||
| ld1 {v28.4s}, [x10], #16 | |||||
| ld1 {v29.4s}, [x11], #16 | |||||
| NEXT_LOOP: | |||||
| sub x6, x6, #8 | sub x6, x6, #8 | ||||
| cmp x6, #8 | cmp x6, #8 | ||||
| bgt LoopC8 | bgt LoopC8 | ||||
| @@ -205,14 +203,14 @@ ConvDw3x3Int8Vertical: | |||||
| b AddZp | b AddZp | ||||
| PerChannelPost: | PerChannelPost: | ||||
| sqshl v23.4s, v23.4s, v28.4s | sqshl v23.4s, v23.4s, v28.4s | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| ld1 {v28.4s}, [x10], #16 | ld1 {v28.4s}, [x10], #16 | ||||
| sqrdmulh v23.4s, v23.4s, v27.4s | sqrdmulh v23.4s, v23.4s, v27.4s | ||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| ld1 {v27.4s}, [x9], #16 | ld1 {v27.4s}, [x9], #16 | ||||
| sqrshl v23.4s, v23.4s, v29.4s | sqrshl v23.4s, v23.4s, v29.4s | ||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| ld1 {v29.4s}, [x11], #16 | ld1 {v29.4s}, [x11], #16 | ||||
| sqshl v24.4s, v24.4s, v28.4s | |||||
| sqrdmulh v24.4s, v24.4s, v27.4s | |||||
| sqrshl v24.4s, v24.4s, v29.4s | |||||
| AddZp: | AddZp: | ||||
| add v23.4s, v23.4s, v26.4s | add v23.4s, v23.4s, v26.4s | ||||