|
- #ifdef __aarch64__
-
- .text
- .align 5
- .global ConvDwInt8Row
- #ifndef __APPLE__
- .type ConvDwInt8Row, %function
- #endif
-
- // void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
- //                    int output_channel, int input_step, int8_t input_zp)
- //
- // Depthwise int8 convolution row accumulate. For every pixel p in [0, num_pixels) and every
- // channel c in [0, output_channel):
- //     output_ptr[p * output_channel + c] += (input_ptr[c] - input_zp) * weight_ptr[c]
- // input_ptr is advanced by input_step bytes after each pixel; weight_ptr is reused for every pixel.
- // Nothing is read or written when num_pixels <= 0 or output_channel <= 0.
- //
- // ABI:     AAPCS64, leaf function, no stack usage.
- // In:      x0 = output_ptr, x1 = input_ptr, x2 = weight_ptr, w3 = num_pixels,
- //          w4 = output_channel, w5 = input_step, w6 = input_zp
- // Scratch: x7  = input cursor for the current pixel
- //          x8  = weight cursor for the current pixel
- //          w9  = channels still to process for the current pixel
- //          x10 = output store cursor (x0 is the output load cursor; both walk the same buffer)
- //          w14, w15, w16, v0-v3, v16-v21, v31, condition flags
- // Callee-saved registers (x19-x29, v8-v15) are never touched, so nothing has to be spilled.
- //
- ConvDwInt8Row:
-     // AAPCS64 leaves the bits above an argument's declared width unspecified, so the int
-     // arguments are only ever read through their w-register views and the int8_t zero point
-     // is sign-extended before it is used in scalar arithmetic.
-     cmp w3, #0
-     ble End                              // no pixels requested
-     cmp w4, #0
-     ble End                              // no channels requested
-
-     sxtb w6, w6                          // w6 = (int32_t)input_zp
-     dup v31.8b, w6                       // v31 = input_zp broadcast to 8 lanes
-     mov x10, x0
-
- LoopOutPixel:
-     mov x7, x1
-     mov x8, x2
-     mov w9, w4
-
- // 16 channels per iteration, software pipelined: the low 8 channels of a group are multiplied
- // right after the loads, the high 8 channels are finished at the top of the next iteration
- // (or in LoopDepth16Out for the last group).
- LoopDepth16In:
-     cmp w9, #16
-     blt L8
-     sub w9, w9, #16
-
-     ld1 {v0.8b, v1.8b}, [x7], #16        // 16 int8 inputs
-     ld1 {v2.8h, v3.8h}, [x8], #32        // 16 int16 weights
-     ld1 {v16.4s, v17.4s}, [x0], #32      // accumulators for channels 0-7
-
-     ssubl v20.8h, v0.8b, v31.8b          // input - input_zp, widened to int16
-     smlal v16.4s, v20.4h, v2.4h
-     smlal2 v17.4s, v20.8h, v2.8h
-
-     cmp w9, #16
-     blt LoopDepth16Out
-
- LoopDepth16:
-     // finish channels 8-15 of the previous group
-     st1 {v16.4s, v17.4s}, [x10], #32
-     ld1 {v18.4s, v19.4s}, [x0], #32
-     ssubl v21.8h, v1.8b, v31.8b
-     smlal v18.4s, v21.4h, v3.4h
-     smlal2 v19.4s, v21.8h, v3.8h
-     st1 {v18.4s, v19.4s}, [x10], #32
-
-     // start channels 0-7 of the next group
-     ld1 {v0.8b, v1.8b}, [x7], #16
-     ld1 {v2.8h, v3.8h}, [x8], #32
-     ld1 {v16.4s, v17.4s}, [x0], #32
-
-     ssubl v20.8h, v0.8b, v31.8b
-     smlal v16.4s, v20.4h, v2.4h
-     smlal2 v17.4s, v20.8h, v2.8h
-
-     sub w9, w9, #16
-     cmp w9, #16
-     bge LoopDepth16
-
- LoopDepth16Out:
-     // drain the pipeline: channels 8-15 of the last 16-wide group
-     st1 {v16.4s, v17.4s}, [x10], #32
-     ld1 {v18.4s, v19.4s}, [x0], #32
-     ssubl v21.8h, v1.8b, v31.8b
-     smlal v18.4s, v21.4h, v3.4h
-     smlal2 v19.4s, v21.8h, v3.8h
-     st1 {v18.4s, v19.4s}, [x10], #32
-
- // 8 channels per iteration; w9 < 16 here.
- L8:
-     cmp w9, #8
-     blt L0
-
- LoopDepth8:
-     ld1 {v0.8b}, [x7], #8
-     ld1 {v2.8h}, [x8], #16
-     ld1 {v16.4s, v17.4s}, [x0], #32
-
-     ssubl v20.8h, v0.8b, v31.8b
-     smlal v16.4s, v20.4h, v2.4h
-     smlal2 v17.4s, v20.8h, v2.8h
-     st1 {v16.4s, v17.4s}, [x10], #32
-
-     sub w9, w9, #8
-     cmp w9, #8
-     bge LoopDepth8
-
- // Scalar tail; 0 <= w9 < 8 here.
- L0:
-     cbz w9, Loop16LineEnd
-
- LoopDepth0:
-     ldrsb w14, [x7], #1                  // input
-     ldrsh w15, [x8], #2                  // weight
-     ldr w16, [x0], #4                    // accumulator
-     sub w14, w14, w6                     // input - input_zp, matching ssubl in the vector paths
-     madd w14, w14, w15, w16              // acc + (input - input_zp) * weight
-     str w14, [x10], #4
-
-     subs w9, w9, #1
-     bne LoopDepth0
-
- Loop16LineEnd:
-     add x1, x1, w5, sxtw                 // input_ptr += input_step (signed 32-bit step)
-     subs w3, w3, #1
-     bne LoopOutPixel
-
- End:
-     ret
-
- #endif
|