#ifdef __aarch64__

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
//                    int output_channel, int input_step, int8_t input_zp)
//
// Accumulates one depthwise-convolution row into a 32-bit buffer. Equivalent C:
//     for (p = 0; p < num_pixels; p++) {
//       for (c = 0; c < output_channel; c++) {
//         *output_ptr++ += (int16_t)(input_ptr[c] - input_zp) * weight_ptr[c];
//       }
//       input_ptr += input_step;   // byte step to the next pixel
//     }
// The weight row is re-read from its start for every pixel; the output cursor keeps advancing.
// Nothing is done when num_pixels <= 0 or output_channel <= 0.
//
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, w3: num_pixels,
// w4: output_channel, w5: input_step, w6: input_zp
//
// Scratch registers: x7 (input cursor), x8 (weight cursor), x9 (channels left in this pixel),
//                    x10 (output write cursor, x0 is the output read cursor), w14 ~ w16,
//                    v0 ~ v3, v16 ~ v21, v31 (input_zp broadcast to 8 lanes)
//
ConvDwInt8Row:
    // registers v8 ~ v15 and x19 ~ x29 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // This routine only touches caller-saved registers and makes no calls, so nothing is saved or restored.

    // The int arguments are 32-bit values: the upper halves of x3/x4/x5 are unspecified on entry,
    // so test them as w registers and sign-extend before any 64-bit use.
cmp w3, #0
ble End                     // no pixels (or a negative count): nothing to do
cmp w4, #0
ble End                     // no channels (or a negative count): nothing to do
sxtw x3, w3
sxtw x4, w4
sxtw x5, w5                 // input_step may legitimately be negative

mov x10, x0                 // x10 writes results back, x0 reads the accumulators

dup v31.8b, w6              // only the low 8 bits of w6 are used here

LoopOutPixel:
mov x7, x1                  // restart input cursor at the current pixel
mov x8, x2                  // restart weight cursor at channel 0
mov x9, x4                  // channels left for this pixel

    // 16 channels per iteration, software pipelined: the first 8 channels of a group are computed
    // right after the load, the second 8 are finished at the top of the next iteration
    // (LoopDepth16) or in the drain code (LoopDepth16Out).
    LoopDepth16In:
    cmp x9, #16
    blt L8
    sub x9, x9, #16

    ld1 {v0.8b, v1.8b}, [x7], #16
    ld1 {v2.8h, v3.8h}, [x8], #32
    ld1 {v16.4s, v17.4s}, [x0], #32

    ssubl v20.8h, v0.8b, v31.8b          // widen to int16: input - input_zp
    smlal v16.4s, v20.4h, v2.4h          // acc[0..3] += in[0..3] * w[0..3]
    smlal2 v17.4s, v20.8h, v2.8h         // acc[4..7] += in[4..7] * w[4..7]


    cmp x9, #16
    blt LoopDepth16Out
    LoopDepth16:

    // finish channels 8..15 of the previous group
    st1 {v16.4s, v17.4s}, [x10], #32
    ld1 {v18.4s, v19.4s}, [x0], #32
    ssubl v21.8h, v1.8b, v31.8b
    smlal v18.4s, v21.4h, v3.4h
    smlal2 v19.4s, v21.8h, v3.8h
    st1 {v18.4s, v19.4s}, [x10], #32

    // start channels 0..7 of the next group
    ld1 {v0.8b, v1.8b}, [x7], #16
    ld1 {v2.8h, v3.8h}, [x8], #32
    ld1 {v16.4s, v17.4s}, [x0], #32

    ssubl v20.8h, v0.8b, v31.8b
    smlal v16.4s, v20.4h, v2.4h
    smlal2 v17.4s, v20.8h, v2.8h

    sub x9, x9, #16
    cmp x9, #16
    bge LoopDepth16

    LoopDepth16Out:

    // drain the pipeline: store channels 0..7 and finish channels 8..15 of the last group
    st1 {v16.4s, v17.4s}, [x10], #32
    ld1 {v18.4s, v19.4s}, [x0], #32
    ssubl v21.8h, v1.8b, v31.8b
    smlal v18.4s, v21.4h, v3.4h
    smlal2 v19.4s, v21.8h, v3.8h
    st1 {v18.4s, v19.4s}, [x10], #32

    L8:
    cmp x9, #8
    blt L0

    // 8 channels per iteration
    LoopDepth8:
    ld1 {v0.8b}, [x7], #8
    ld1 {v2.8h}, [x8], #16
    ld1 {v16.4s, v17.4s}, [x0], #32

    ssubl v20.8h, v0.8b, v31.8b
    smlal v16.4s, v20.4h, v2.4h
    smlal2 v17.4s, v20.8h, v2.8h
    st1 {v16.4s, v17.4s}, [x10], #32

    sub x9, x9, #8
    cmp x9, #8
    bge LoopDepth8

    L0:
    cbz x9, Loop16LineEnd

    // scalar tail: remaining 1..7 channels, one at a time
    LoopDepth0:
    ldrsb w14, [x7], #1                  // input, sign-extended int8
    ldrsh w15, [x8], #2                  // weight, sign-extended int16
    ldr w16, [x0], #4                    // current accumulator
    // input - input_zp, matching ssubl in the vector paths. sxtb sign-extends the int8 zero point
    // because bits 8..31 of w6 are unspecified on entry. The difference lies in [-255, 255], so no
    // further narrowing to 16 bits is needed.
    sub w14, w14, w6, sxtb
    madd w14, w14, w15, w16
    str w14, [x10], #4

    subs x9, x9, #1
    bne LoopDepth0

    Loop16LineEnd:

subs x3, x3, #1
add x1, x1, x5              // next pixel (add does not alter the flags set by subs)
bne LoopOutPixel

End:
ret

#endif