#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
//                    int output_channel, int input_step, int8_t input_zp)
//
// Accumulates one depthwise-convolution row in place. For every pixel and every channel c in
// [0, output_channel):
//     *output_ptr++ += (input_ptr[c] - input_zp) * weight_ptr[c];
// After each pixel input_ptr advances by input_step bytes; the weight row is restarted for every pixel.
// Nothing is written when num_pixels <= 0, and a pixel contributes nothing when output_channel <= 0.
//
// Arguments on entry (AAPCS32):
//   r0: output_ptr, r1: input_ptr, r2: weight_ptr, r3: num_pixels
//   stack: output_channel, input_step, input_zp (loaded into r4, r5, r6 below)
// Register roles inside the body:
//   r0:  accumulator read cursor       r7:  accumulator write cursor (same addresses, trails r0)
//   r8:  input cursor for this pixel   r10: weight cursor for this pixel
//   r11: channels left in this pixel   r9, r12, lr: temporaries of the scalar tail
//   d30: input_zp broadcast to 8 x int8
// Clobbers: q0-q3, q8, q15 (d30), flags. r4-r12 and q4-q7 are restored before returning.

ConvDwInt8Row:
    // r4-r11 (r9 is platform-dependent) and q4-q7 (d8-d15) are callee-saved under AAPCS32:
    // https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    // r12 is pushed as well, which makes the integer save area 40 bytes; with the 64 bytes of
    // q4-q7 the frame is 104 bytes, a multiple of 8.
    // lr is pushed and popped straight into pc (the "push {lr} ... pop {pc}" return shape).
    push {r4-r12, lr}
    vpush {q4-q7}

    cmp r3, #0
    ble End // no pixels (or a negative count): nothing to do

    // sp is deliberately NOT moved back above the save area: anything below sp may be overwritten
    // asynchronously (signal handlers, interrupts), which would corrupt the registers restored at End.
    // The stack arguments therefore sit 104 bytes (40 + 64) above the current sp.
    ldr r4, [sp, #104] // output_channel
    ldr r5, [sp, #108] // input_step
    ldr r6, [sp, #112] // input_zp, widened to a full word by the caller (AAPCS32)
    vdup.8 d30, r6     // only the low byte of r6 is broadcast

    mov r7, r0 // write cursor starts at the same place as the read cursor

    LoopPixel:
        mov r8, r1 // input
        mov r10, r2 // weight
        mov r11, r4 // channels left

        // 16 channels per step, software-pipelined: the low 8 channels of a group are multiplied
        // here (or at the bottom of LoopDepth16), the high 8 at the top of the next iteration
        // (or in LoopDepth16Out).
        LoopDepth16In:
            cmp r11, #16
            blt L8
            sub r11, r11, #16

            vld1.8 {q0}, [r8]!        // 16 x int8 input
            vld1.16 {q1, q2}, [r10]! // weight: 16 x int16

            vsubl.s8 q3, d0, d30  // -zp, low 8 inputs widened to int16 (d6, d7)
            vld1.32 {q4, q5}, [r0]!   // accumulators 0..7
            vmlal.s16 q4, d6, d2      // acc[0..3] += x[0..3] * w[0..3]
            vmlal.s16 q5, d7, d3      // acc[4..7] += x[4..7] * w[4..7]

            cmp r11, #16
            blt LoopDepth16Out // this was the last full group of 16
            LoopDepth16:
                vst1.32 {q4, q5}, [r7]!   // store accumulators 0..7 of the current group

                vsubl.s8 q6, d1, d30      // -zp, high 8 inputs widened to int16 (d12, d13)
                vld1.32 {q7, q8}, [r0]!   // accumulators 8..15
                vmlal.s16 q7, d12, d4     // acc[8..11]  += x[8..11]  * w[8..11]
                vmlal.s16 q8, d13, d5     // acc[12..15] += x[12..15] * w[12..15]
                vst1.32 {q7, q8}, [r7]!

                // start the next group of 16: load it and handle its low 8 channels
                vld1.8 {q0}, [r8]!
                vld1.16 {q1, q2}, [r10]! // weight

                vsubl.s8 q3, d0, d30  // -zp
                vld1.32 {q4, q5}, [r0]!
                vmlal.s16 q4, d6, d2
                vmlal.s16 q5, d7, d3

                sub r11, r11, #16
                cmp r11, #16
                bge LoopDepth16

            // drain the pipeline: finish the group whose low half is still held in q4/q5
            LoopDepth16Out:
                vst1.32 {q4, q5}, [r7]!

                vsubl.s8 q6, d1, d30
                vld1.32 {q7, q8}, [r0]!
                vmlal.s16 q7, d12, d4
                vmlal.s16 q8, d13, d5
                vst1.32 {q7, q8}, [r7]!

        // 8 channels per step
        L8:
            cmp r11, #8
            blt L0

            LoopDepth8:
                vld1.8 {d0}, [r8]!          // 8 x int8 input
                vld1.16 {d2, d3}, [r10]! // weight: 8 x int16

                vsubl.s8 q2, d0, d30  // -zp, widened to int16 (d4, d5)

                vld1.32 {q3}, [r0]!
                vmlal.s16 q3, d4, d2        // acc[0..3] += x[0..3] * w[0..3]
                vst1.32 {q3}, [r7]!

                vld1.32 {q4}, [r0]!
                vmlal.s16 q4, d5, d3        // acc[4..7] += x[4..7] * w[4..7]
                vst1.32 {q4}, [r7]!

                sub r11, r11, #8
                cmp r11, #8
                bge LoopDepth8

        // scalar tail: one channel per step
        L0:
            cmp r11, #0
            ble LoopDepthEnd // also covers a non-positive output_channel

            LoopDepth0:
                ldrsb r12, [r8], #1   // x = *input++ (sign-extended)
                ldrsh r9, [r10], #2   // w = *weight++ (sign-extended)
                sub r12, r12, r6      // x - zp

                ldr lr, [r0], #4      // acc
                smlabb r12, r12, r9, lr // acc + (int16)(x - zp) * (int16)w
                str r12, [r7], #4

                subs r11, r11, #1
                bne LoopDepth0

        LoopDepthEnd:
        add r1, r1, r5 // next pixel: input_ptr += input_step
        subs r3, r3, #1
        bne LoopPixel

    End:
    vpop {q4-q7}
    pop {r4-r12, pc}
#endif
#endif