|
-
.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4
#ifndef __APPLE__
.type PostFuncBiasReluC4, %function
#endif

// void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
//                         size_t plane_size, size_t plane_stride, size_t relu_type);
//
// Reads src as consecutive 4-float vectors, adds a 4-float bias to each one,
// optionally clamps the result, and stores it into dst, whose rows are
// (oc4div + oc4mod) floats apart.
//
// Arguments (32-bit ARM: the first four in registers, the rest on the stack):
//   r0  dst
//   r1  src
//   r2  bias           four floats are loaded per channel group, including the tail group
//   r3  oc4div         channels handled four at a time; the group loop exits only when
//                      its counter (stepping by 4) equals oc4div, so it must be a multiple of 4
//   r4  oc4mod         leftover channels: 0 = none, 2 or 3 = that many,
//                      any other non-zero value is handled as 1
//   r5  plane_size     number of 4-float vectors processed per channel group
//   r6  plane_stride   byte count added to src after each full channel group
//   r7  relu_type      3 = clamp to [0, 6], 1 = clamp to [0, +inf), anything else = no clamp
//
// Register roles:
//   q0 - q3   data vectors
//   q12       bias of the current channel group
//   q14       6.0f in every lane
//   q15       0.0f in every lane
//   r8        vectors left in the current channel group
//   r11       dst write cursor (third-channel cursor in the 3-channel tail)
//   r12       dst row stride in bytes
//   lr        channel index of the next group

PostFuncBiasReluC4:
    // Eight registers = 32 bytes. sp is left pointing at the save area so the
    // saved values never sit below the stack pointer. r10 is not used as
    // scratch; it stays in the list to keep the frame at 32 bytes.
    push {r4-r8, r10, r11, lr}

    // The stack arguments start right above the 32-byte save area.
    ldr r4, [sp, #32]                   // oc4mod
    ldr r5, [sp, #36]                   // plane_size
    ldr r6, [sp, #40]                   // plane_stride
    ldr r7, [sp, #44]                   // relu_type

    vmov.i32 q14, #6
    vcvt.f32.s32 q14, q14               // q14 = 6.0f
    veor q15, q15, q15                  // q15 = 0.0f

    add r12, r3, r4
    mov r12, r12, lsl #2                // r12 = (oc4div + oc4mod) * sizeof(float)

    mov lr, #0

// One pass per group of four channels.
Loop_C4:
    cmp lr, r3
    beq Loop_C1
    add r11, r0, lr, lsl #2             // r11 = &dst[lr]
    add lr, lr, #4
    mov r8, r5
    vld1.32 {q12}, [r2]!

// Four vectors per iteration while at least four remain.
Loop_4x4:
    cmp r8, #4
    blt Loop_1x4
    sub r8, r8, #4
    vld1.32 {q0-q1}, [r1]!
    vld1.32 {q2-q3}, [r1]!

    vadd.f32 q0, q0, q12
    vadd.f32 q1, q1, q12
    vadd.f32 q2, q2, q12
    vadd.f32 q3, q3, q12

    cmp r7, #3
    beq Relu6_4x4
    cmp r7, #1
    beq Relu_4x4
    b Write_4x4
Relu6_4x4:
    vmin.f32 q0, q0, q14
    vmin.f32 q1, q1, q14
    vmin.f32 q2, q2, q14
    vmin.f32 q3, q3, q14
    // falls through: relu6 also needs the lower clamp
Relu_4x4:
    vmax.f32 q0, q0, q15
    vmax.f32 q1, q1, q15
    vmax.f32 q2, q2, q15
    vmax.f32 q3, q3, q15
Write_4x4:
    vst1.32 {q0}, [r11], r12
    vst1.32 {q1}, [r11], r12
    vst1.32 {q2}, [r11], r12
    vst1.32 {q3}, [r11], r12
    b Loop_4x4

// Remaining (fewer than four) vectors of the group, one at a time.
Loop_1x4:
    cmp r7, #3
    beq Relu6_1x4
    cmp r7, #1
    beq Relu_1x4
    b Write_1x4
Relu6_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {q0}, [r11], r12
    b Relu6_1x4
Relu_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {q0}, [r11], r12
    b Relu_1x4
Write_1x4:
    cmp r8, #0
    beq HW_Add
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {q0}, [r11], r12
    b Write_1x4

HW_Add:
    add r1, r1, r6                      // skip plane_stride bytes of src
    b Loop_C4

// Tail group: only the first oc4mod lanes of each vector are stored.
Loop_C1:
    cmp r4, #0
    beq End
    mov r8, r5
    vld1.32 {q12}, [r2]!                // still a full 4-float bias load
    add r0, r0, lr, lsl #2              // r0 = &dst[oc4div]

    cmp r4, #2
    beq Loop_C1_2
    cmp r4, #3
    beq Loop_C1_3
    // oc4mod == 1, and any other non-zero value, falls through

Loop_C1_1:
    cmp r7, #3
    beq Loop_C1_1_Relu6
    cmp r7, #1
    beq Loop_C1_1_Relu
    b Loop_C1_1_Write
Loop_C1_1_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Relu6
Loop_C1_1_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Relu
Loop_C1_1_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0[0]}, [r0], r12
    b Loop_C1_1_Write

Loop_C1_2:
    cmp r7, #3
    beq Loop_C1_2_Relu6
    cmp r7, #1
    beq Loop_C1_2_Relu
    b Loop_C1_2_Write
Loop_C1_2_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Relu6
Loop_C1_2_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Relu
Loop_C1_2_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0}, [r0], r12
    b Loop_C1_2_Write

// Three leftover channels: lanes 0-1 go through r0, lane 2 through r11.
// Both cursors must advance by the dst row stride (r12) to stay 8 bytes apart.
Loop_C1_3:
    add r11, r0, #8
    cmp r7, #3
    beq Loop_C1_3_Relu6
    cmp r7, #1
    beq Loop_C1_3_Relu
    b Loop_C1_3_Write
Loop_C1_3_Relu6:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmin.f32 q0, q0, q14
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Relu6
Loop_C1_3_Relu:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vmax.f32 q0, q0, q15
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Relu
Loop_C1_3_Write:
    cmp r8, #0
    beq End
    sub r8, r8, #1
    vld1.32 {q0}, [r1]!
    vadd.f32 q0, q0, q12
    vst1.32 {d0}, [r0], r12
    vst1.32 {d1[0]}, [r11], r12
    b Loop_C1_3_Write

End:
    pop {r4-r8, r10, r11, pc}
|