#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
//                    int output_channel, int input_step, int8_t input_zp)
//
// Accumulates one row of an int8 depthwise convolution into an int32 buffer.
// For every pixel p in [0, num_pixels) and every channel c in [0, output_channel):
//     output_ptr[p * output_channel + c] += (input_ptr[p * input_step + c] - input_zp) * weight_ptr[c]
// The same weights are reused for every pixel. Nothing is done when num_pixels <= 0,
// and no channel is touched when output_channel <= 0.
//
// Arguments (AAPCS32):
//   r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels
//   stack: output_channel, input_step, input_zp (loaded into r4, r5, r6 below)
//
// Register roles inside the function:
//   r0  : output read cursor        r7  : output write cursor (same addresses as r0)
//   r1  : input base of the current pixel (advanced by input_step per pixel)
//   r2  : weight base               r3  : pixels remaining
//   r4  : output_channel            r5  : input_step         r6 : input_zp
//   r8  : input cursor              r10 : weight cursor      r11: channels remaining
//   r9, r12, lr : scratch in the scalar tail
//   d30 : input_zp broadcast to 8 x int8

ConvDwInt8Row:
    // At return, clang generates "push {lr} ... pop {pc}" while gcc generates "bx lr"
    // (see https://stackoverflow.com/questions/53625807). The clang form is used here
    // even though this routine makes no subroutine calls.
    // r4-r11 and q4-q7 are callee-saved according to
    // https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    // r12 is pushed as well, which keeps the save area a multiple of 8 bytes.
    push {r4-r12, lr}               // 10 core registers = 40 bytes
    vpush {q4-q7}                   // 4 q registers     = 64 bytes

    // SP is deliberately left pointing at the save area. Moving it back above the
    // saved registers would leave them below SP, where a signal handler or interrupt
    // is free to overwrite them. The stack arguments are therefore addressed past
    // the 104-byte save area.
    cmp r3, #0
    ble End                         // nothing to do for num_pixels <= 0

    ldr r4, [sp, #104]              // output_channel
    ldr r5, [sp, #108]              // input_step
    ldr r6, [sp, #112]              // input_zp; the scalar tail uses the whole word, so this
                                    // relies on the caller having sign-extended the int8_t
    vdup.8 d30, r6                  // low byte of r6 in every lane

    mov r7, r0

LoopPixel:
    mov r8, r1                      // input
    mov r10, r2                     // weight
    mov r11, r4                     // channels remaining for this pixel

LoopDepth16In:
    // 16 channels per step, software pipelined: the low 8 channels of a block are
    // computed one stage ahead of the high 8 channels.
    cmp r11, #16
    blt L8
    sub r11, r11, #16

    vld1.8 {q0}, [r8]!              // 16 x int8 input
    vld1.16 {q1, q2}, [r10]!        // 16 x int16 weight

    vsubl.s8 q3, d0, d30            // input[0..7] - zp, widened to int16
    vld1.32 {q4, q5}, [r0]!         // accumulators for channels 0..7
    vmlal.s16 q4, d6, d2
    vmlal.s16 q5, d7, d3

    cmp r11, #16
    blt LoopDepth16Out

LoopDepth16:
    vst1.32 {q4, q5}, [r7]!         // channels 0..7 of the previous block

    vsubl.s8 q6, d1, d30            // input[8..15] - zp
    vld1.32 {q7, q8}, [r0]!         // accumulators for channels 8..15
    vmlal.s16 q7, d12, d4
    vmlal.s16 q8, d13, d5
    vst1.32 {q7, q8}, [r7]!

    vld1.8 {q0}, [r8]!              // next 16 x int8 input
    vld1.16 {q1, q2}, [r10]!        // next 16 x int16 weight

    vsubl.s8 q3, d0, d30            // -zp
    vld1.32 {q4, q5}, [r0]!
    vmlal.s16 q4, d6, d2
    vmlal.s16 q5, d7, d3

    sub r11, r11, #16
    cmp r11, #16
    bge LoopDepth16

LoopDepth16Out:
    // Drain the pipeline: finish the block that is still in flight.
    vst1.32 {q4, q5}, [r7]!

    vsubl.s8 q6, d1, d30
    vld1.32 {q7, q8}, [r0]!
    vmlal.s16 q7, d12, d4
    vmlal.s16 q8, d13, d5
    vst1.32 {q7, q8}, [r7]!

L8:
    cmp r11, #8
    blt L0

LoopDepth8:
    vld1.8 {d0}, [r8]!              // 8 x int8 input
    vld1.16 {d2, d3}, [r10]!        // 8 x int16 weight

    vsubl.s8 q2, d0, d30            // -zp

    vld1.32 {q3}, [r0]!
    vmlal.s16 q3, d4, d2
    vst1.32 {q3}, [r7]!

    vld1.32 {q4}, [r0]!
    vmlal.s16 q4, d5, d3
    vst1.32 {q4}, [r7]!

    sub r11, r11, #8
    cmp r11, #8
    bge LoopDepth8

L0:
    cmp r11, #0
    ble LoopDepthEnd                // also covers output_channel <= 0

LoopDepth0:
    // Scalar tail: one channel per iteration; r11 > 0 on entry.
    ldrsb r12, [r8], #1             // input
    ldrsh r9, [r10], #2             // weight
    sub r12, r12, r6                // -zp

    ldr lr, [r0], #4                // current accumulator
    smlabb r12, r12, r9, lr         // (input - zp) * weight + accumulator
    str r12, [r7], #4

    subs r11, r11, #1
    bne LoopDepth0

LoopDepthEnd:
    add r1, r1, r5                  // next pixel of the input
    subs r3, r3, #1
    bne LoopPixel

End:
    vpop {q4-q7}
    pop {r4-r12, pc}
#endif
#endif