|
- #ifdef __arm__
- #ifndef __aarch64__
-
- .text
- .align 5
- .global ConvDw3x3BorderPixelInt8
- #ifndef __APPLE__
- .type ConvDw3x3BorderPixelInt8, %function
- #endif
-
- // void ConvDw3x3BorderPixelInt8(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias,
- //                               size_t height, size_t width, size_t in_kh_step, size_t in_kw_step,
- //                               size_t channel, size_t in_zp, size_t out_zp, size_t out_multiplier,
- //                               size_t left_shift, size_t right_shift, size_t acc_min, size_t acc_max)
- //
- // For every block of 8 channels this routine:
- //   1. loads 8 int32 accumulators from bias,
- //   2. for each of `height` rows and `width` columns adds (src - in_zp) * weight (int8 x int16 -> int32),
- //   3. requantizes: shift by left_shift, VQRDMULH by out_multiplier, rounding shift by right_shift,
- //      add out_zp, clamp to [acc_min, acc_max], saturating-narrow to int8,
- //   4. stores 8 output bytes to dst.
- //
- // Addressing (all steps in bytes):
- //   src:    column step = in_kw_step, row step = in_kh_step, +8 per channel block
- //   weight: column step = channel * 2, row step = 3 * channel * 2, +16 per channel block
- //
- // Argument passing (AAPCS32): r0 = dst, r1 = src, r2 = weight, r3 = bias; every other argument is read
- // from the caller's stack in declaration order (height first, acc_max last).
- //
- // Register roles inside the function:
- //   r4  = rows remaining            r5  = columns remaining
- //   r6  = in_kh_step                r7  = in_kw_step
- //   r8  = channels remaining        lr  = channel * 2 (weight column step)
- //   r9  = src row pointer           r10 = weight row pointer (also scratch while loading constants)
- //   r11 = src column pointer        r12 = weight column pointer
- //   d18 = in_zp (x8 lanes)          q15 = out_zp      q14 = out_multiplier   q13 = left_shift
- //   q12 = right_shift               q11 = acc_min     q10 = acc_max
- //   q3/q4 = accumulators, q0-q2 and q5-q7, d16 = temporaries
- //
- // Preconditions visible from the loop structure (not checked here):
- //   - height >= 1 and width >= 1: both loops decrement before testing, so a count of 0 wraps around.
- //   - the channel loop always runs once and repeats while at least 8 channels remain afterwards;
- //     presumably the caller guarantees channel is a positive multiple of 8 - TODO confirm.
- //   - only the low byte of in_zp is used.
- //   - VRSHL shifts right only for negative lane values, so right_shift is presumably passed as a
- //     non-positive number - TODO confirm against the caller.
- ConvDw3x3BorderPixelInt8:
-     // r4-r11 and q4-q7 are callee-saved under AAPCS32; lr is saved so the function can return with
-     // "pop {..., pc}". Saved area: 10 core registers (40 bytes) + q4-q7 (64 bytes) = 104 bytes.
-     // sp stays below the saved area for the whole function: memory below sp is not protected from
-     // asynchronous stack users (e.g. signal frames), so the stack arguments are addressed at
-     // [sp, #104 + n] instead of moving sp back up over the saved registers.
-     push {r4-r12, lr}
-     vpush {q4-q7}
-
-     ldr r6, [sp, #112]              // in_kh_step    (104 + 8)
-     ldr r7, [sp, #116]              // in_kw_step    (104 + 12)
-     ldr r8, [sp, #120]              // channel       (104 + 16)
-
-     ldrb r10, [sp, #124]            // in_zp, low byte only (104 + 20)
-     vdup.8 d18, r10
-     ldr r10, [sp, #128]             // out_zp        (104 + 24)
-     vdup.32 q15, r10
-     ldr r10, [sp, #132]             // out_multiplier (104 + 28)
-     vdup.32 q14, r10
-     ldr r10, [sp, #136]             // left_shift    (104 + 32)
-     vdup.32 q13, r10
-     ldr r10, [sp, #140]             // right_shift   (104 + 36)
-     vdup.32 q12, r10
-     ldr r10, [sp, #144]             // acc_min       (104 + 40)
-     vdup.32 q11, r10
-     ldr r10, [sp, #148]             // acc_max       (104 + 44)
-     vdup.32 q10, r10
-
-     add lr, r8, r8                  // lr = channel * sizeof(int16_t): weight column step
-
- LoopC:
-     mov r9, r1                      // src row pointer for this channel block
-     mov r10, r2                     // weight row pointer for this channel block
-     ldr r4, [sp, #104]              // r4 = height
-
-     vld1.32 {q3}, [r3]!             // accumulators start from bias[0..3]
-     vld1.32 {q4}, [r3]!             // and bias[4..7]
- LoopH:
-     mov r11, r9                     // src column pointer
-     mov r12, r10                    // weight column pointer
-     ldr r5, [sp, #108]              // r5 = width
- LoopW:
-     vld1.8 {d0}, [r11], r7          // 8 x int8 input, advance by in_kw_step
-     vld1.16 {d2, d3}, [r12], lr     // 8 x int16 weight, advance by channel * 2
-     vsubl.s8 q2, d0, d18            // widen to int16 and subtract in_zp
-
-     vmlal.s16 q3, d4, d2            // channels 0..3
-     vmlal.s16 q4, d5, d3            // channels 4..7
-
-     subs r5, r5, #1
-     bne LoopW
-
-     subs r4, r4, #1                 // flags consumed by "bne LoopH"; the adds below leave them alone
-     add r9, r9, r6                  // next src row
-     add r10, r10, lr                // next weight row: r10 += 3 * lr,
-     add r10, r10, lr, lsl #1        // done as lr + 2 * lr so no scratch register is needed
-     bne LoopH
-
-     // Requantize channels 0..3.
-     vshl.s32 q3, q3, q13            // shift by left_shift
-     vqrdmulh.s32 q3, q3, q14        // saturating rounding doubling multiply-high by out_multiplier
-     vand q5, q3, q12                // sign bit set only where value and shift count are both negative
-     vshr.s32 q5, q5, #31            // -> -1 in those lanes, 0 elsewhere
-     vqadd.s32 q3, q3, q5            // rounding fix-up applied before the rounding shift
-     vrshl.s32 q3, q3, q12           // rounding shift by right_shift
-     vadd.i32 q3, q3, q15            // + out_zp
-     vmax.s32 q3, q3, q11            // clamp to acc_min
-     vmin.s32 q3, q3, q10            // clamp to acc_max
-     vqmovn.s32 d14, q3              // int32 -> int16 (low half of q7)
-
-     // Requantize channels 4..7 (same sequence).
-     vshl.s32 q4, q4, q13
-     vqrdmulh.s32 q4, q4, q14
-     vand q6, q4, q12
-     vshr.s32 q6, q6, #31
-     vqadd.s32 q4, q4, q6
-     vrshl.s32 q4, q4, q12
-     vadd.i32 q4, q4, q15
-     vmax.s32 q4, q4, q11
-     vmin.s32 q4, q4, q10
-     vqmovn.s32 d15, q4              // int32 -> int16 (high half of q7)
-     vqmovn.s16 d16, q7              // int16 -> int8 for all 8 channels
-
-     vst1.8 {d16}, [r0]!             // store 8 outputs, advance dst
-     add r1, r1, #8                  // next 8 input channels
-     add r2, r2, #16                 // next 8 int16 weights
-
-     sub r8, r8, #8
-     cmp r8, #8
-     bge LoopC                       // continue while at least 8 channels remain
-
-     vpop {q4-q7}
-     pop {r4-r12, pc}
- #endif
- #endif
|