|
|
@@ -7,219 +7,273 @@ |
|
|
#ifndef __APPLE__ |
|
|
#ifndef __APPLE__ |
|
|
.type ConvDwInt8Center, %function |
|
|
.type ConvDwInt8Center, %function |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
|
|
|
// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, |
|
|
|
|
|
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, |
|
|
|
|
|
// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, |
|
|
|
|
|
// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); |
|
|
|
|
|
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w, |
|
|
|
|
|
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step |
|
|
|
|
|
// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max |
|
|
|
|
|
|
|
|
// void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height, |
|
|
|
|
|
// int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, |
|
|
|
|
|
// int in_sw_step, int in_kh_step, int in_kw_step, int8_t *in_zp, int32_t *out_zp, |
|
|
|
|
|
// int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t *acc_min, |
|
|
|
|
|
// int32_t *acc_max) |
|
|
|
|
|
// #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, |
|
|
|
|
|
// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step |
|
|
|
|
|
// #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60: acc_min, #64: acc_max |
|
|
ConvDwInt8Center: |
|
|
ConvDwInt8Center: |
|
|
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" |
|
|
|
|
|
// according to https://stackoverflow.com/questions/53625807 |
|
|
|
|
|
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway |
|
|
|
|
|
// clang's rule seems simpler, though there are no subroutine calls here |
|
|
|
|
|
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf |
|
|
|
|
|
|
|
|
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" |
|
|
|
|
|
// according to https://stackoverflow.com/questions/53625807 |
|
|
|
|
|
// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway |
|
|
|
|
|
// clang's rule seems simpler, though there are no subroutine calls here |
|
|
|
|
|
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf |
|
|
push {r0-r8, r10, r11, lr} |
|
|
push {r0-r8, r10, r11, lr} |
|
|
vpush {q4-q7} |
|
|
vpush {q4-q7} |
|
|
add sp, sp, #112 |
|
|
|
|
|
|
|
|
|
|
|
ldr r4, [sp, #48] |
|
|
|
|
|
|
|
|
ldr lr, [sp, #168] |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vpush {q0, q1} |
|
|
|
|
|
ldr lr, [sp, #204] |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vpush {q0, q1} |
|
|
|
|
|
ldr lr, [sp, #240] |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vpush {q0, q1} |
|
|
|
|
|
add sp, sp, #208 |
|
|
|
|
|
|
|
|
ldr r12, [sp, #92] |
|
|
|
|
|
vdup.32 q9, r12 |
|
|
|
|
|
|
|
|
ldr r1, [sp, #-36] |
|
|
|
|
|
vld1.32 {q8, q9}, [r1] |
|
|
|
|
|
ldr r1, [sp, #44] |
|
|
|
|
|
vld1.32 {q10, q11}, [r1] |
|
|
|
|
|
ldr r1, [sp, #48] |
|
|
|
|
|
vld1.32 {q12, q13}, [r1] |
|
|
|
|
|
ldr r1, [sp, #52] |
|
|
|
|
|
vld1.32 {q14, q15}, [r1] |
|
|
|
|
|
|
|
|
ldr r11, [sp, #88] |
|
|
|
|
|
vdup.32 q10, r11 |
|
|
|
|
|
|
|
|
ldr r11, [sp, #28] |
|
|
|
|
|
ldr r4, [sp] |
|
|
|
|
|
LoopH: |
|
|
|
|
|
ldr r1, [sp, #-44] |
|
|
|
|
|
ldr r0, [sp, #-48] |
|
|
|
|
|
ldr r5, [sp, #4] |
|
|
|
|
|
LoopW2: |
|
|
|
|
|
vmov q4, q8 |
|
|
|
|
|
vmov q5, q9 |
|
|
|
|
|
vmov q6, q8 |
|
|
|
|
|
vmov q7, q9 |
|
|
|
|
|
mov r7, r1 |
|
|
|
|
|
ldr r3, [sp, #-40] |
|
|
|
|
|
ldr r6, [sp, #8] |
|
|
|
|
|
LoopKH: |
|
|
|
|
|
mov r9, r7 |
|
|
|
|
|
ldr r10, [sp, #12] |
|
|
|
|
|
LoopKW: |
|
|
|
|
|
mov r8, r9 |
|
|
|
|
|
vld1.16 {q0}, [r3]! |
|
|
|
|
|
ldr lr, [sp, #40] |
|
|
|
|
|
vld1.8 {d2}, [lr] |
|
|
|
|
|
|
|
|
ldr r10, [sp, #96] |
|
|
|
|
|
vdup.32 q11, r10 |
|
|
|
|
|
|
|
|
vld1.8 {d3}, [r8] |
|
|
|
|
|
add r8, r8, r11 |
|
|
|
|
|
vsubl.s8 q2, d3, d2 |
|
|
|
|
|
vmlal.s16 q4, d4, d0 |
|
|
|
|
|
vmlal.s16 q5, d5, d1 |
|
|
|
|
|
|
|
|
ldr r8, [sp, #100] |
|
|
|
|
|
vdup.32 q12, r8 |
|
|
|
|
|
|
|
|
|
|
|
ldr r7, [sp, #104] |
|
|
|
|
|
vdup.32 q13, r7 |
|
|
|
|
|
|
|
|
vld1.8 {d3}, [r8] |
|
|
|
|
|
add r8, r8, r11 |
|
|
|
|
|
vsubl.s8 q2, d3, d2 |
|
|
|
|
|
vmlal.s16 q6, d4, d0 |
|
|
|
|
|
vmlal.s16 q7, d5, d1 |
|
|
|
|
|
|
|
|
ldr r6, [sp, #108] |
|
|
|
|
|
vdup.32 q14, r6 |
|
|
|
|
|
|
|
|
ldr r12, [sp, #36] |
|
|
|
|
|
add r9, r9, r12 |
|
|
|
|
|
subs r10, r10, #1 |
|
|
|
|
|
bne LoopKW |
|
|
|
|
|
ldr r12, [sp, #32] |
|
|
|
|
|
add r7, r7, r12 |
|
|
|
|
|
subs r6, r6, #1 |
|
|
|
|
|
bne LoopKH |
|
|
|
|
|
|
|
|
vld1.32 {q15}, [r3] |
|
|
|
|
|
|
|
|
vshl.s32 q4, q4, q14 |
|
|
|
|
|
vshl.s32 q5, q5, q15 |
|
|
|
|
|
vshl.s32 q6, q6, q14 |
|
|
|
|
|
vshl.s32 q7, q7, q15 |
|
|
|
|
|
|
|
|
LoopH: |
|
|
|
|
|
ldr r1, [sp, #4] // src_w |
|
|
|
|
|
ldr r5, [sp, #52] // width |
|
|
|
|
|
ldr r0, [sp] // dst_w |
|
|
|
|
|
LoopW4: |
|
|
|
|
|
ldr r11, [sp, #76] // in_sw_step |
|
|
|
|
|
mov r8, r1 // src_kh |
|
|
|
|
|
ldr r2, [sp, #8] // weight_kh |
|
|
|
|
|
ldr r6, [sp, #56] // kernel_h |
|
|
|
|
|
vmov q0, q15 |
|
|
|
|
|
LoopKh4: |
|
|
|
|
|
ldr r12, [sp, #80] //in_kh_step |
|
|
|
|
|
ldr r7, [sp, #60] // kernel_w |
|
|
|
|
|
mov r10, r8 // src_kw |
|
|
|
|
|
LoopKw4: |
|
|
|
|
|
vld1.16 {d24}, [r2]! |
|
|
|
|
|
vld1.16 {d8}, [r10] |
|
|
|
|
|
add r10, r10, r11 |
|
|
|
|
|
vmlal.s16 q0, d8, d24 |
|
|
|
|
|
vld1.16 {d10}, [r10] |
|
|
|
|
|
add r10, r10, r11 |
|
|
|
|
|
vmlal.s16 q1, d10, d24 |
|
|
|
|
|
vld1.16 {d12}, [r10] |
|
|
|
|
|
add r10, r10, r11 |
|
|
|
|
|
vmlal.s16 q2, d12, d24 |
|
|
|
|
|
vld1.16 {d14}, [r10] |
|
|
|
|
|
add r10, r10, r11 |
|
|
|
|
|
vmlal.s16 q3, d14, d24 |
|
|
|
|
|
subs r7, r7, #1 |
|
|
|
|
|
bne LoopKw4 |
|
|
|
|
|
ldr r12, [sp, #80] |
|
|
|
|
|
add r8, r8, r12 |
|
|
|
|
|
subs r6, r6, #1 |
|
|
|
|
|
bne LoopKh4 |
|
|
|
|
|
|
|
|
|
|
|
vshl.s32 q0, q0, q9 |
|
|
|
|
|
vshl.s32 q1, q1, q9 |
|
|
|
|
|
vshl.s32 q2, q2, q9 |
|
|
|
|
|
vshl.s32 q3, q3, q9 |
|
|
|
|
|
vqrdmulh.s32 q0, q0, q10 |
|
|
|
|
|
vqrdmulh.s32 q1, q1, q10 |
|
|
|
|
|
vqrdmulh.s32 q2, q2, q10 |
|
|
|
|
|
vqrdmulh.s32 q3, q3, q10 |
|
|
|
|
|
vand q4, q0, q11 |
|
|
|
|
|
vshr.s32 q4, q4, #31 |
|
|
|
|
|
vqadd.s32 q0, q0, q4 |
|
|
|
|
|
vrshl.s32 q0, q0, q11 |
|
|
|
|
|
vand q5, q1, q11 |
|
|
|
|
|
vshr.s32 q5, q5, #31 |
|
|
|
|
|
vqadd.s32 q1, q1, q5 |
|
|
|
|
|
vrshl.s32 q1, q1, q11 |
|
|
|
|
|
vand q6, q2, q11 |
|
|
|
|
|
vshr.s32 q6, q6, #31 |
|
|
|
|
|
vqadd.s32 q2, q2, q6 |
|
|
|
|
|
vrshl.s32 q2, q2, q11 |
|
|
|
|
|
vand q7, q3, q11 |
|
|
|
|
|
vshr.s32 q7, q7, #31 |
|
|
|
|
|
vqadd.s32 q3, q3, q7 |
|
|
|
|
|
vrshl.s32 q3, q3, q11 |
|
|
|
|
|
vadd.i32 q0, q0, q12 |
|
|
|
|
|
vadd.i32 q1, q1, q12 |
|
|
|
|
|
vadd.i32 q2, q2, q12 |
|
|
|
|
|
vadd.i32 q3, q3, q12 |
|
|
|
|
|
vmax.s32 q0, q0, q13 |
|
|
|
|
|
vmax.s32 q1, q1, q13 |
|
|
|
|
|
vmax.s32 q2, q2, q13 |
|
|
|
|
|
vmax.s32 q3, q3, q13 |
|
|
|
|
|
vmin.s32 q0, q0, q14 |
|
|
|
|
|
vmin.s32 q1, q1, q14 |
|
|
|
|
|
vmin.s32 q2, q2, q14 |
|
|
|
|
|
vmin.s32 q3, q3, q14 |
|
|
|
|
|
|
|
|
|
|
|
vqmovn.s32 d0, q0 |
|
|
|
|
|
vqmovn.s32 d2, q1 |
|
|
|
|
|
vqmovn.s32 d4, q2 |
|
|
|
|
|
vqmovn.s32 d6, q3 |
|
|
|
|
|
vqmovn.s16 d0, q0 |
|
|
|
|
|
vqmovn.s16 d2, q1 |
|
|
|
|
|
vqmovn.s16 d4, q2 |
|
|
|
|
|
vqmovn.s16 d6, q3 |
|
|
|
|
|
|
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
ldr r12, [sp, #68] |
|
|
|
|
|
vst1.8 {d0[0]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[1]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[2]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[3]}, [r3]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
vst1.8 {d2[0]}, [r3]! |
|
|
|
|
|
vst1.8 {d2[1]}, [r3]! |
|
|
|
|
|
vst1.8 {d2[2]}, [r3]! |
|
|
|
|
|
vst1.8 {d2[3]}, [r3]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
vst1.8 {d4[0]}, [r3]! |
|
|
|
|
|
vst1.8 {d4[1]}, [r3]! |
|
|
|
|
|
vst1.8 {d4[2]}, [r3]! |
|
|
|
|
|
vst1.8 {d4[3]}, [r3]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
vst1.8 {d6[0]}, [r3]! |
|
|
|
|
|
vst1.8 {d6[1]}, [r3]! |
|
|
|
|
|
vst1.8 {d6[2]}, [r3]! |
|
|
|
|
|
vst1.8 {d6[3]}, [r3]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
mov r12, #4 |
|
|
|
|
|
mul r11, r11, r12 |
|
|
|
|
|
|
|
|
vqrdmulh.s32 q4, q4, q12 |
|
|
|
|
|
vqrdmulh.s32 q5, q5, q13 |
|
|
|
|
|
vqrdmulh.s32 q6, q6, q12 |
|
|
|
|
|
vqrdmulh.s32 q7, q7, q13 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #144 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
|
|
|
|
|
|
vand q2, q4, q0 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q4, q4, q2 |
|
|
|
|
|
vrshl.s32 q4, q4, q0 |
|
|
|
|
|
|
|
|
|
|
|
vand q2, q5, q1 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q5, q5, q2 |
|
|
|
|
|
vrshl.s32 q5, q5, q1 |
|
|
|
|
|
|
|
|
|
|
|
vand q2, q6, q0 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q6, q6, q2 |
|
|
|
|
|
vrshl.s32 q6, q6, q0 |
|
|
|
|
|
|
|
|
|
|
|
vand q2, q7, q1 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q7, q7, q2 |
|
|
|
|
|
vrshl.s32 q7, q7, q1 |
|
|
|
|
|
|
|
|
|
|
|
vadd.i32 q4, q4, q10 |
|
|
|
|
|
vadd.i32 q5, q5, q11 |
|
|
|
|
|
vadd.i32 q6, q6, q10 |
|
|
|
|
|
vadd.i32 q7, q7, q11 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #176 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vmax.s32 q4, q4, q0 |
|
|
|
|
|
vmax.s32 q5, q5, q1 |
|
|
|
|
|
vmax.s32 q6, q6, q0 |
|
|
|
|
|
vmax.s32 q7, q7, q1 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #208 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vmin.s32 q4, q4, q0 |
|
|
|
|
|
vmin.s32 q5, q5, q1 |
|
|
|
|
|
vmin.s32 q6, q6, q0 |
|
|
|
|
|
vmin.s32 q7, q7, q1 |
|
|
|
|
|
|
|
|
|
|
|
vqmovn.s32 d0, q4 |
|
|
|
|
|
vqmovn.s32 d1, q5 |
|
|
|
|
|
vqmovn.s32 d2, q6 |
|
|
|
|
|
vqmovn.s32 d3, q7 |
|
|
|
|
|
vqmovn.s16 d0, q0 |
|
|
|
|
|
vqmovn.s16 d1, q1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ldr r12, [sp, #20] |
|
|
|
|
|
mov r8, r0 |
|
|
|
|
|
vst1.8 {d0[0]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[1]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[2]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[3]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[4]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[5]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[6]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[7]}, [r8]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
|
|
|
|
|
|
mov r8, r0 |
|
|
|
|
|
vst1.8 {d1[0]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[1]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[2]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[3]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[4]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[5]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[6]}, [r8]! |
|
|
|
|
|
vst1.8 {d1[7]}, [r8]! |
|
|
|
|
|
add r0, r0, r12 |
|
|
|
|
|
|
|
|
|
|
|
add r1, r1, r11 |
|
|
add r1, r1, r11 |
|
|
add r1, r1, r11 |
|
|
sub r5, r5, #4 |
|
|
|
|
|
cmp r5, #0 |
|
|
|
|
|
ble LoopWEnd |
|
|
|
|
|
cmp r5, #4 |
|
|
|
|
|
bge LoopW4 |
|
|
|
|
|
|
|
|
subs r5, r5, #2 |
|
|
|
|
|
beq LoopEndW |
|
|
|
|
|
cmp r5, #2 |
|
|
|
|
|
bge LoopW2 |
|
|
|
|
|
|
|
|
LoopW: |
|
|
LoopW: |
|
|
mov r8, r1 // src_kh |
|
|
|
|
|
ldr r2, [sp, #8] // weight_kh |
|
|
|
|
|
ldr r6, [sp, #56] // kernel_h |
|
|
|
|
|
vmov q0, q15 |
|
|
|
|
|
LoopKh: |
|
|
|
|
|
ldr r12, [sp, #84] //in_kw_step |
|
|
|
|
|
ldr r7, [sp, #60] // kernel_w |
|
|
|
|
|
mov r10, r8 // src_kw |
|
|
|
|
|
LoopKw: |
|
|
|
|
|
vld1.16 {d2}, [r10] |
|
|
|
|
|
add r10, r10, r12 |
|
|
|
|
|
vld1.16 {d24}, [r2]! |
|
|
|
|
|
vmlal.s16 q0, d2, d24 |
|
|
|
|
|
subs r7, r7, #1 |
|
|
|
|
|
bne LoopKw |
|
|
|
|
|
ldr r12, [sp, #80] |
|
|
|
|
|
add r8, r8, r12 |
|
|
|
|
|
|
|
|
vmov q4, q8 |
|
|
|
|
|
vmov q5, q9 |
|
|
|
|
|
mov r7, r1 |
|
|
|
|
|
ldr r3, [sp, #-40] |
|
|
|
|
|
ldr r6, [sp, #8] |
|
|
|
|
|
LoopKH1: |
|
|
|
|
|
mov r9, r7 |
|
|
|
|
|
ldr r10, [sp, #12] |
|
|
|
|
|
LoopKW1: |
|
|
|
|
|
vld1.16 {q0}, [r3]! |
|
|
|
|
|
ldr lr, [sp, #40] |
|
|
|
|
|
vld1.8 {d2}, [lr] |
|
|
|
|
|
|
|
|
|
|
|
vld1.8 {d3}, [r9] |
|
|
|
|
|
vsubl.s8 q2, d3, d2 |
|
|
|
|
|
vmlal.s16 q4, d4, d0 |
|
|
|
|
|
vmlal.s16 q5, d5, d1 |
|
|
|
|
|
|
|
|
|
|
|
ldr r12, [sp, #36] |
|
|
|
|
|
add r9, r9, r12 |
|
|
|
|
|
subs r10, r10, #1 |
|
|
|
|
|
bne LoopKW1 |
|
|
|
|
|
ldr r12, [sp, #32] |
|
|
|
|
|
add r7, r7, r12 |
|
|
subs r6, r6, #1 |
|
|
subs r6, r6, #1 |
|
|
bne LoopKh |
|
|
|
|
|
|
|
|
|
|
|
vshl.s32 q0, q0, q9 |
|
|
|
|
|
vqrdmulh.s32 q0, q0, q10 |
|
|
|
|
|
vand q4, q0, q11 |
|
|
|
|
|
vshr.s32 q4, q4, #31 |
|
|
|
|
|
vqadd.s32 q0, q0, q4 |
|
|
|
|
|
vrshl.s32 q0, q0, q11 |
|
|
|
|
|
vadd.i32 q0, q0, q12 |
|
|
|
|
|
vmax.s32 q0, q0, q13 |
|
|
|
|
|
vmin.s32 q0, q0, q14 |
|
|
|
|
|
|
|
|
|
|
|
vqmovn.s32 d0, q0 |
|
|
|
|
|
vqmovn.s16 d0, q0 |
|
|
|
|
|
|
|
|
|
|
|
mov r3, r0 |
|
|
|
|
|
ldr r12, [sp, #68] |
|
|
|
|
|
vst1.8 {d0[0]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[1]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[2]}, [r3]! |
|
|
|
|
|
vst1.8 {d0[3]}, [r3]! |
|
|
|
|
|
|
|
|
bne LoopKH1 |
|
|
|
|
|
|
|
|
|
|
|
vshl.s32 q4, q4, q14 |
|
|
|
|
|
vshl.s32 q5, q5, q15 |
|
|
|
|
|
|
|
|
|
|
|
vqrdmulh.s32 q4, q4, q12 |
|
|
|
|
|
vqrdmulh.s32 q5, q5, q13 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #144 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vand q2, q4, q0 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q4, q4, q2 |
|
|
|
|
|
vrshl.s32 q4, q4, q0 |
|
|
|
|
|
|
|
|
|
|
|
vand q2, q5, q1 |
|
|
|
|
|
vshr.s32 q2, q2, #31 |
|
|
|
|
|
vqadd.s32 q5, q5, q2 |
|
|
|
|
|
vrshl.s32 q5, q5, q1 |
|
|
|
|
|
|
|
|
|
|
|
vadd.i32 q4, q4, q10 |
|
|
|
|
|
vadd.i32 q5, q5, q11 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #176 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vmax.s32 q4, q4, q0 |
|
|
|
|
|
vmax.s32 q5, q5, q1 |
|
|
|
|
|
|
|
|
|
|
|
sub lr, sp, #208 |
|
|
|
|
|
vld1.32 {q0, q1}, [lr] |
|
|
|
|
|
vmin.s32 q4, q4, q0 |
|
|
|
|
|
vmin.s32 q5, q5, q1 |
|
|
|
|
|
|
|
|
|
|
|
vqmovn.s32 d0, q4 |
|
|
|
|
|
vqmovn.s32 d1, q5 |
|
|
|
|
|
vqmovn.s16 d0, q0 |
|
|
|
|
|
|
|
|
|
|
|
mov r8, r0 |
|
|
|
|
|
vst1.8 {d0[0]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[1]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[2]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[3]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[4]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[5]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[6]}, [r8]! |
|
|
|
|
|
vst1.8 {d0[7]}, [r8]! |
|
|
|
|
|
ldr r12, [sp, #20] |
|
|
add r0, r0, r12 |
|
|
add r0, r0, r12 |
|
|
ldr r12, [sp, #76] |
|
|
|
|
|
add r1, r1, r12 |
|
|
|
|
|
|
|
|
add r1, r1, r11 |
|
|
subs r5, r5, #1 |
|
|
subs r5, r5, #1 |
|
|
bne LoopW |
|
|
bne LoopW |
|
|
ldr r3, [sp, #64] |
|
|
|
|
|
ldr r12, [sp] |
|
|
|
|
|
add r12, r12, r3 |
|
|
|
|
|
str r12, [sp] |
|
|
|
|
|
ldr r3, [sp, #72] |
|
|
|
|
|
ldr r12, [sp, #4] |
|
|
|
|
|
add r12, r12, r3 |
|
|
|
|
|
str r12, [sp, #4] |
|
|
|
|
|
subs r4, r4, #1 |
|
|
|
|
|
bne LoopH |
|
|
|
|
|
LoopWEnd: |
|
|
|
|
|
sub sp, sp, #112 |
|
|
|
|
|
vpop {q4-q7} |
|
|
|
|
|
pop {r0-r8, r10, r11, pc} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LoopEndW: |
|
|
|
|
|
ldr r12, [sp, #16] |
|
|
|
|
|
ldr r1, [sp, #-48] |
|
|
|
|
|
add r1, r1, r12 |
|
|
|
|
|
str r1, [sp, #-48] |
|
|
|
|
|
ldr r12, [sp, #24] |
|
|
|
|
|
ldr r1, [sp, #-44] |
|
|
|
|
|
add r1, r1, r12 |
|
|
|
|
|
str r1, [sp, #-44] |
|
|
|
|
|
subs r4, r4, #1 |
|
|
|
|
|
bne LoopH |
|
|
|
|
|
|
|
|
|
|
|
LoopEndH: |
|
|
|
|
|
sub sp, sp, #208 |
|
|
|
|
|
vpop {q0, q1} |
|
|
|
|
|
vpop {q0, q1} |
|
|
|
|
|
vpop {q0, q1} |
|
|
|
|
|
vpop {q4-q7} |
|
|
|
|
|
pop {r0-r8, r10, r11, pc} |
|
|
#endif |
|
|
#endif |
|
|
#endif |
|
|
#endif |