| @@ -0,0 +1,439 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global PostFuncBiasReluC8 | |||||
| #ifndef __APPLE__ | |||||
| .type PostFuncBiasReluC8, %function | |||||
| #endif | |||||
| // void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||||
| //                         size_t plane_size, size_t stride, size_t relu_type); | |||||
| // | |||||
| // Adds a per-channel bias to src and stores the result to dst, optionally clamping it first. | |||||
| // src is read sequentially, 8 floats per plane element, one 8-channel block after another. | |||||
| // dst is written at dst + channel * 4 bytes, and the write pointer advances by stride after every | |||||
| // plane element, so stride is a byte count. | |||||
| // relu_type == 3 clamps to [0, 6], relu_type == 1 clamps to [0, +inf), any other value stores unclamped. | |||||
| // oc8div is the number of channels handled in full blocks of 8. The block loop exits only on equality, | |||||
| // so oc8div is expected to be a multiple of 8. oc8mod (expected 0..7) is the number of leftover channels. | |||||
| // NOTE(review): the tail path still loads 8 bias floats and 8 src floats per element, so both buffers | |||||
| // are assumed to be padded to a multiple of 8 channels - confirm against the callers. | |||||
| // | |||||
| // Register roles: | |||||
| // r0 dst          r1 src (post-incremented)   r2 bias (post-incremented) | |||||
| // r3 oc8div       r4 oc8mod                   r5 plane_size | |||||
| // r6 stride       r7 relu_type | |||||
| // r4 ~ r7 are loaded from the stack (arguments 5 ~ 8). | |||||
| // q0 ~ q3, q8 ~ q11 values | |||||
| // q12 q13 bias data of the current 8-channel block | |||||
| // q14 relu6 upper bound (6.0); q15 relu lower bound (0.0) | |||||
| // r10 r11 write pointers | |||||
| // lr channel counter of the oc8 loop | |||||
| // r8 remaining plane elements of the hw loop | |||||
| // q4 ~ q7 (d8 ~ d15) are never touched, so no NEON register has to be saved. | |||||
| PostFuncBiasReluC8: | |||||
| push {r4-r8, r10, r11, lr} | |||||
| // Eight registers (32 bytes) were pushed, so the stack-passed arguments start at sp + 32. | |||||
| // sp itself is left untouched: the saved registers must never end up below the stack pointer, | |||||
| // where a signal handler or interrupt is free to overwrite them. | |||||
| ldr r4, [sp, #32] | |||||
| ldr r5, [sp, #36] | |||||
| ldr r6, [sp, #40] | |||||
| ldr r7, [sp, #44] | |||||
| // q14 = 6.0 in every lane (integer 6 converted to float), q15 = 0.0 in every lane. | |||||
| vmov.i32 q14, #6 | |||||
| vcvt.f32.s32 q14, q14 | |||||
| veor q15, q15, q15 | |||||
| mov lr, #0 | |||||
| // One iteration per full block of 8 output channels. | |||||
| Loop_C8: | |||||
| cmp lr, r3 | |||||
| beq Loop_C1 | |||||
| // r11 = dst + lr * sizeof(float): first write address of this channel block. | |||||
| add r11, r0, lr, lsl #2 | |||||
| add lr, lr, #8 | |||||
| mov r8, r5 | |||||
| vld1.32 {q12-q13}, [r2]! | |||||
| // Main plane loop: four plane elements (4 x 8 floats) per iteration. | |||||
| Loop_4x8: | |||||
| cmp r8, #4 | |||||
| blt Loop_1x8 | |||||
| sub r8, r8, #4 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vld1.32 {q2-q3}, [r1]! | |||||
| vld1.32 {q8-q9}, [r1]! | |||||
| vld1.32 {q10-q11}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vadd.f32 q2, q2, q12 | |||||
| vadd.f32 q3, q3, q13 | |||||
| vadd.f32 q8, q8, q12 | |||||
| vadd.f32 q9, q9, q13 | |||||
| vadd.f32 q10, q10, q12 | |||||
| vadd.f32 q11, q11, q13 | |||||
| cmp r7, #3 | |||||
| beq Relu6_4x8 | |||||
| cmp r7, #1 | |||||
| beq Relu_4x8 | |||||
| b Write_4x8 | |||||
| // relu6 applies the upper bound here and then falls through into the lower bound below. | |||||
| Relu6_4x8: | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmin.f32 q1, q1, q14 | |||||
| vmin.f32 q2, q2, q14 | |||||
| vmin.f32 q3, q3, q14 | |||||
| vmin.f32 q8, q8, q14 | |||||
| vmin.f32 q9, q9, q14 | |||||
| vmin.f32 q10, q10, q14 | |||||
| vmin.f32 q11, q11, q14 | |||||
| Relu_4x8: | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vmax.f32 q2, q2, q15 | |||||
| vmax.f32 q3, q3, q15 | |||||
| vmax.f32 q8, q8, q15 | |||||
| vmax.f32 q9, q9, q15 | |||||
| vmax.f32 q10, q10, q15 | |||||
| vmax.f32 q11, q11, q15 | |||||
| Write_4x8: | |||||
| vst1.32 {q0-q1}, [r11], r6 | |||||
| vst1.32 {q2-q3}, [r11], r6 | |||||
| vst1.32 {q8-q9}, [r11], r6 | |||||
| vst1.32 {q10-q11}, [r11], r6 | |||||
| b Loop_4x8 | |||||
| // Remaining (fewer than four) plane elements of the block, one element per iteration. | |||||
| // The activation is selected once, then a dedicated loop runs until r8 reaches zero. | |||||
| Loop_1x8: | |||||
| cmp r7, #3 | |||||
| beq Relu6_1x8 | |||||
| cmp r7, #1 | |||||
| beq Relu_1x8 | |||||
| b Write_1x8 | |||||
| Relu6_1x8: | |||||
| cmp r8, #0 | |||||
| beq Loop_C8 | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmin.f32 q1, q1, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0-q1}, [r11], r6 | |||||
| b Relu6_1x8 | |||||
| Relu_1x8: | |||||
| cmp r8, #0 | |||||
| beq Loop_C8 | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0-q1}, [r11], r6 | |||||
| b Relu_1x8 | |||||
| Write_1x8: | |||||
| cmp r8, #0 | |||||
| beq Loop_C8 | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vst1.32 {q0-q1}, [r11], r6 | |||||
| b Write_1x8 | |||||
| // Tail: the last oc8mod (1..7) channels. Each element still reads 8 floats from src, | |||||
| // but only the first oc8mod lanes are stored. | |||||
| Loop_C1: | |||||
| cmp r4, #0 | |||||
| beq End | |||||
| mov r8, r5 | |||||
| vld1.32 {q12-q13}, [r2]! | |||||
| // r0 = dst + lr * sizeof(float): first write address of the tail channels. | |||||
| add r0, r0, lr, lsl #2 | |||||
| cmp r4, #1 | |||||
| beq Loop_C1_1 | |||||
| cmp r4, #2 | |||||
| beq Loop_C1_2 | |||||
| cmp r4, #3 | |||||
| beq Loop_C1_3 | |||||
| cmp r4, #4 | |||||
| beq Loop_C1_4 | |||||
| cmp r4, #5 | |||||
| beq Loop_C1_5 | |||||
| cmp r4, #6 | |||||
| beq Loop_C1_6 | |||||
| cmp r4, #7 | |||||
| beq Loop_C1_7 | |||||
| // Any other oc8mod value falls through into the one-channel path. | |||||
| // One leftover channel: store lane 0 only. | |||||
| Loop_C1_1: | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_1_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_1_Relu | |||||
| b Loop_C1_1_Write | |||||
| Loop_C1_1_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0[0]}, [r0], r6 | |||||
| b Loop_C1_1_Relu6 | |||||
| Loop_C1_1_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0[0]}, [r0], r6 | |||||
| b Loop_C1_1_Relu | |||||
| Loop_C1_1_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vst1.32 {d0[0]}, [r0], r6 | |||||
| b Loop_C1_1_Write | |||||
| // Two leftover channels: store d0 (lanes 0-1). | |||||
| Loop_C1_2: | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_2_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_2_Relu | |||||
| b Loop_C1_2_Write | |||||
| Loop_C1_2_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| b Loop_C1_2_Relu6 | |||||
| Loop_C1_2_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| b Loop_C1_2_Relu | |||||
| Loop_C1_2_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| b Loop_C1_2_Write | |||||
| // Three leftover channels: d0 goes to r0, lane 2 goes to r11 = r0 + 8. | |||||
| Loop_C1_3: | |||||
| add r11, r0, #8 | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_3_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_3_Relu | |||||
| b Loop_C1_3_Write | |||||
| Loop_C1_3_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| vst1.32 {d1[0]}, [r11], r6 | |||||
| b Loop_C1_3_Relu6 | |||||
| Loop_C1_3_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| vst1.32 {d1[0]}, [r11], r6 | |||||
| b Loop_C1_3_Relu | |||||
| Loop_C1_3_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vst1.32 {d0}, [r0], r6 | |||||
| vst1.32 {d1[0]}, [r11], r6 | |||||
| b Loop_C1_3_Write | |||||
| // Four leftover channels: store q0. | |||||
| Loop_C1_4: | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_4_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_4_Relu | |||||
| b Loop_C1_4_Write | |||||
| Loop_C1_4_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| b Loop_C1_4_Relu6 | |||||
| Loop_C1_4_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| // Loop back into the plain relu loop; branching to the relu6 loop would clamp | |||||
| // every element after the first one to 6. | |||||
| b Loop_C1_4_Relu | |||||
| Loop_C1_4_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| b Loop_C1_4_Write | |||||
| // Five leftover channels: q0 goes to r0, lane 4 goes to r11 = r0 + 16. | |||||
| Loop_C1_5: | |||||
| add r11, r0, #16 | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_5_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_5_Relu | |||||
| b Loop_C1_5_Write | |||||
| Loop_C1_5_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmin.f32 q1, q1, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2[0]}, [r11], r6 | |||||
| b Loop_C1_5_Relu6 | |||||
| Loop_C1_5_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2[0]}, [r11], r6 | |||||
| b Loop_C1_5_Relu | |||||
| Loop_C1_5_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2[0]}, [r11], r6 | |||||
| b Loop_C1_5_Write | |||||
| // Six leftover channels: q0 goes to r0, d2 (lanes 4-5) goes to r11 = r0 + 16. | |||||
| Loop_C1_6: | |||||
| add r11, r0, #16 | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_6_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_6_Relu | |||||
| b Loop_C1_6_Write | |||||
| Loop_C1_6_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmin.f32 q1, q1, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| b Loop_C1_6_Relu6 | |||||
| Loop_C1_6_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| b Loop_C1_6_Relu | |||||
| Loop_C1_6_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| b Loop_C1_6_Write | |||||
| // Seven leftover channels: q0 goes to r0, d2 to r11 = r0 + 16, lane 6 to r10 = r0 + 24. | |||||
| Loop_C1_7: | |||||
| add r11, r0, #16 | |||||
| add r10, r0, #24 | |||||
| cmp r7, #3 | |||||
| beq Loop_C1_7_Relu6 | |||||
| cmp r7, #1 | |||||
| beq Loop_C1_7_Relu | |||||
| b Loop_C1_7_Write | |||||
| Loop_C1_7_Relu6: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmin.f32 q0, q0, q14 | |||||
| vmin.f32 q1, q1, q14 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| vst1.32 {d3[0]}, [r10], r6 | |||||
| b Loop_C1_7_Relu6 | |||||
| Loop_C1_7_Relu: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vmax.f32 q0, q0, q15 | |||||
| vmax.f32 q1, q1, q15 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| vst1.32 {d3[0]}, [r10], r6 | |||||
| b Loop_C1_7_Relu | |||||
| Loop_C1_7_Write: | |||||
| cmp r8, #0 | |||||
| beq End | |||||
| sub r8, r8, #1 | |||||
| vld1.32 {q0-q1}, [r1]! | |||||
| vadd.f32 q0, q0, q12 | |||||
| vadd.f32 q1, q1, q13 | |||||
| vst1.32 {q0}, [r0], r6 | |||||
| vst1.32 {d2}, [r11], r6 | |||||
| vst1.32 {d3[0]}, [r10], r6 | |||||
| b Loop_C1_7_Write | |||||
| // sp was never moved, so the saved registers can be popped directly; popping pc returns. | |||||
| End: | |||||
| pop {r4-r8, r10, r11, pc} | |||||
| #endif | |||||
| @@ -42,7 +42,7 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p | |||||
| void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | ||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | ||||
| #ifndef ENABLE_ARM64 | |||||
| #ifndef ENABLE_ARM | |||||
| PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM); | PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM); | ||||
| #else | #else | ||||
| size_t oc8mod = output_channel % C8NUM; | size_t oc8mod = output_channel % C8NUM; | ||||
| @@ -45,6 +45,8 @@ void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weigh | |||||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | ||||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | ||||
| void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||||
| size_t plane_size, size_t stride, size_t relu_type); | |||||
| #endif | #endif | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -57,9 +59,6 @@ void Relu(float *data, size_t element4); | |||||
| void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | ||||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | ||||
| void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||||
| size_t plane_size, size_t stride, size_t relu_type); | |||||
| void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | void ConvSwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | ||||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t ic4, | size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t ic4, | ||||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, | size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, | ||||