Merge pull request !6182 from liuzhongkai/arm32_new1tags/v1.0.0
| @@ -0,0 +1,63 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| .text | |||||
| .align 5 | |||||
| .global ConvDwFp32Border | |||||
| #ifndef __APPLE__ | |||||
| .type ConvDwFp32Border, %function | |||||
| #endif | |||||
| // void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) | |||||
| // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | |||||
| // r8: kernel_w, r9: relu, r10: relu6 | |||||
| ConvDwFp32Border: | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||||
| push {r4-r12, lr} | |||||
| vpush {q4-q7} | |||||
| add sp, sp, #104 | |||||
| ldr r4, [sp] // height | |||||
| ldr r5, [sp, #4] // width | |||||
| ldr r6, [sp, #8] // in_kh_step | |||||
| ldr r7, [sp, #12] // in_kw_step | |||||
| ldr r8, [sp, #16] // kernel_w | |||||
| ldr r9, [sp, #20] // relu | |||||
| ldr r10, [sp, #24] // relu6 | |||||
| vld1.32 {q0}, [r3] // bias | |||||
| vmov.i32 q1, #6 // relu6 | |||||
| vcvt.f32.s32 q1, q1 | |||||
| veor q2, q2, q2 // relu | |||||
| LoopH: | |||||
| mov r11, r1 | |||||
| mov r12, r2 | |||||
| mov r14, r5 | |||||
| LoopW: | |||||
| vld1.32 {q3}, [r11], r7 | |||||
| vld1.32 {q4}, [r12]! | |||||
| vmla.f32 q0, q3, q4 | |||||
| subs r14, r14, #1 | |||||
| bne LoopW | |||||
| subs r4, r4, #1 | |||||
| add r1, r1, r6 | |||||
| add r2, r2, r8 | |||||
| bne LoopH | |||||
| cmp r10, #0 | |||||
| bne Relu6 | |||||
| cmp r9, #0 | |||||
| bne Relu | |||||
| b Write | |||||
| Relu6: | |||||
| vmin.f32 q0, q0, q1 | |||||
| Relu: | |||||
| vmax.f32 q0, q0, q2 | |||||
| Write: | |||||
| vst1.32 {q0}, [r0] | |||||
| sub sp, sp, #104 | |||||
| vpop {q4-q7} | |||||
| pop {r4-r12, pc} | |||||
| #endif | |||||
| @@ -11,9 +11,9 @@ | |||||
| // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | ||||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | ||||
| // size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | // size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | ||||
| // r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: weight, #56: kernel_h, #60: kernel_w, | |||||
| // #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step | |||||
| // #88: relu, #92: relu6 | |||||
| // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | |||||
| // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step | |||||
| // #40: relu, #44: relu6 | |||||
| ConvDwFp32Center: | ConvDwFp32Center: | ||||
| // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" | // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" | ||||
| // according to https://stackoverflow.com/questions/53625807 | // according to https://stackoverflow.com/questions/53625807 | ||||
| @@ -24,7 +24,7 @@ ConvDwFp32Center: | |||||
| vpush {q4-q7} | vpush {q4-q7} | ||||
| add sp, sp, #112 | add sp, sp, #112 | ||||
| ldr r4, [sp, #48] | |||||
| ldr r4, [sp] // height | |||||
| vld1.32 {q13}, [r3] | vld1.32 {q13}, [r3] | ||||
| vmov.i32 q14, #6 | vmov.i32 q14, #6 | ||||
| @@ -32,22 +32,25 @@ ConvDwFp32Center: | |||||
| veor q15, q15, q15 | veor q15, q15, q15 | ||||
| LoopH: | LoopH: | ||||
| ldr r1, [sp, #4] // src_w | |||||
| ldr r5, [sp, #52] // width | |||||
| ldr r0, [sp] // dst_w | |||||
| ldr r1, [sp, #-44] // src_w, src_h = src | |||||
| ldr r5, [sp, #4] // width | |||||
| ldr r0, [sp, #-48] // dst_w, dst_h = dst | |||||
| cmp r5, #4 | cmp r5, #4 | ||||
| blt LoopW | blt LoopW | ||||
| LoopW4: | LoopW4: | ||||
| ldr r11, [sp, #76] // in_sw_step | |||||
| mov r8, r1 // src_kh | |||||
| ldr r2, [sp, #8] // weight_kh | |||||
| ldr r6, [sp, #56] // kernel_h | |||||
| ldr r11, [sp, #28] // in_sw_step | |||||
| mov r8, r1 // src_kh, src_w | |||||
| ldr r2, [sp, #-40] // weight_kh, weight | |||||
| ldr r6, [sp, #8] // kernel_h | |||||
| vmov q0, q13 | vmov q0, q13 | ||||
| vmov q1, q13 | |||||
| vmov q2, q13 | |||||
| vmov q3, q13 | |||||
| LoopKh4: | LoopKh4: | ||||
| ldr r12, [sp, #80] //in_kh_step | |||||
| ldr r7, [sp, #60] // kernel_w | |||||
| mov lr, r8 // src_kw | |||||
| ldr r7, [sp, #12] // kernel_w | |||||
| mov lr, r8 // src_kw, src_kh | |||||
| LoopKw4: | LoopKw4: | ||||
| ldr r12, [sp, #36] //in_kw_step | |||||
| mov r10, lr | mov r10, lr | ||||
| vld1.32 {q12}, [r2]! | vld1.32 {q12}, [r2]! | ||||
| vld1.32 {q4}, [r10] | vld1.32 {q4}, [r10] | ||||
| @@ -65,14 +68,14 @@ ConvDwFp32Center: | |||||
| subs r7, r7, #1 | subs r7, r7, #1 | ||||
| add lr, lr, r12 | add lr, lr, r12 | ||||
| bne LoopKw4 | bne LoopKw4 | ||||
| ldr r12, [sp, #80] | |||||
| ldr r12, [sp, #32] // in_kh_step | |||||
| add r8, r8, r12 | add r8, r8, r12 | ||||
| subs r6, r6, #1 | subs r6, r6, #1 | ||||
| bne LoopKh4 | bne LoopKh4 | ||||
| ldr r12, [sp, #92] | |||||
| ldr r12, [sp, #44] | |||||
| cmp r12, #0 | cmp r12, #0 | ||||
| bne Relu64 | bne Relu64 | ||||
| ldr r12, [sp, #88] | |||||
| ldr r12, [sp, #40] | |||||
| cmp r12, #0 | cmp r12, #0 | ||||
| bne Relu4 | bne Relu4 | ||||
| b Write4 | b Write4 | ||||
| @@ -87,7 +90,7 @@ ConvDwFp32Center: | |||||
| vmax.f32 q2, q2, q15 | vmax.f32 q2, q2, q15 | ||||
| vmax.f32 q3, q3, q15 | vmax.f32 q3, q3, q15 | ||||
| Write4: | Write4: | ||||
| ldr r12, [sp, #68] | |||||
| ldr r12, [sp, #20] // block_channel | |||||
| vst1.32 {q0}, [r0] | vst1.32 {q0}, [r0] | ||||
| add r0, r0, r12 | add r0, r0, r12 | ||||
| vst1.32 {q1}, [r0] | vst1.32 {q1}, [r0] | ||||
| @@ -98,36 +101,36 @@ ConvDwFp32Center: | |||||
| add r0, r0, r12 | add r0, r0, r12 | ||||
| mov r12, #4 | mov r12, #4 | ||||
| mul r11, r11, r12 | mul r11, r11, r12 | ||||
| add r1, r1, r11 | |||||
| add r1, r1, r11 // src_w += in_sw_step | |||||
| sub r5, r5, #4 | sub r5, r5, #4 | ||||
| cmp r5, #0 | cmp r5, #0 | ||||
| ble LoopWEnd | ble LoopWEnd | ||||
| cmp r5, #4 | cmp r5, #4 | ||||
| bge LoopW | bge LoopW | ||||
| LoopW: | LoopW: | ||||
| mov r8, r1 // src_kh | |||||
| ldr r2, [sp, #8] // weight_kh | |||||
| ldr r6, [sp, #56] // kernel_h | |||||
| vmov q0, q13 | |||||
| mov r8, r1 // src_kh, src_w | |||||
| ldr r2, [sp, #-40] // weight_kh, weight | |||||
| ldr r6, [sp, #8] // kernel_h | |||||
| vmov q0, q13 // bias | |||||
| LoopKh: | LoopKh: | ||||
| ldr r12, [sp, #84] //in_kw_step | |||||
| ldr r7, [sp, #60] // kernel_w | |||||
| mov r10, r8 // src_kw | |||||
| ldr r7, [sp, #12] // kernel_w | |||||
| mov r10, r8 // src_kw, src_kh | |||||
| LoopKw: | LoopKw: | ||||
| ldr r12, [sp, #36] //in_kw_step | |||||
| vld1.32 {q1}, [r10] | vld1.32 {q1}, [r10] | ||||
| add r10, r10, r12 | add r10, r10, r12 | ||||
| vld1.32 {q12}, [r2]! | vld1.32 {q12}, [r2]! | ||||
| vmla.f32 q0, q1, q12 | vmla.f32 q0, q1, q12 | ||||
| subs r7, r7, #1 | subs r7, r7, #1 | ||||
| bne LoopKw | bne LoopKw | ||||
| ldr r12, [sp, #80] | |||||
| ldr r12, [sp, #32] // in_kh_step | |||||
| add r8, r8, r12 | add r8, r8, r12 | ||||
| subs r6, r6, #1 | subs r6, r6, #1 | ||||
| bne LoopKh | bne LoopKh | ||||
| ldr r12, [sp, #92] | |||||
| ldr r12, [sp, #44] | |||||
| cmp r12, #0 | cmp r12, #0 | ||||
| bne Relu6 | bne Relu6 | ||||
| ldr r12, [sp, #88] | |||||
| ldr r12, [sp, #40] | |||||
| cmp r12, #0 | cmp r12, #0 | ||||
| bne Relu | bne Relu | ||||
| b Write | b Write | ||||
| @@ -136,22 +139,24 @@ ConvDwFp32Center: | |||||
| Relu: | Relu: | ||||
| vmax.f32 q0, q0, q15 | vmax.f32 q0, q0, q15 | ||||
| Write: | Write: | ||||
| ldr r12, [sp, #68] | |||||
| vst1.32 {q0}, [r0] | |||||
| ldr r12, [sp, #20] // block_channel | |||||
| vst1.32 {q0}, [r0] // dst_kw += block_channel | |||||
| add r0, r0, r12 | add r0, r0, r12 | ||||
| ldr r12, [sp, #76] | |||||
| add r1, r1, r12 | |||||
| ldr r12, [sp, #28] // in_sw_step | |||||
| add r1, r1, r12 // src_w += in_sw_step | |||||
| subs r5, r5, #1 | subs r5, r5, #1 | ||||
| bne LoopW | bne LoopW | ||||
| ldr r3, [sp, #64] | |||||
| ldr r12, [sp] | |||||
| ldr r3, [sp, #16] // out_h_step | |||||
| ldr r12, [sp, #-48] | |||||
| add r12, r12, r3 | add r12, r12, r3 | ||||
| str r12, [sp] | |||||
| ldr r3, [sp, #72] | |||||
| ldr r12, [sp, #4] | |||||
| str r12, [sp, #-48] | |||||
| ldr r3, [sp, #24] // in_sh_step | |||||
| ldr r12, [sp, #-44] // src_h += in_sh_step | |||||
| add r12, r12, r3 | add r12, r12, r3 | ||||
| str r12, [sp, #4] | |||||
| subs r4, r4, #1 | |||||
| str r12, [sp, #-44] | |||||
| subs r4, r4, #1 // height | |||||
| bne LoopH | bne LoopH | ||||
| LoopWEnd: | LoopWEnd: | ||||
| sub sp, sp, #112 | sub sp, sp, #112 | ||||
| @@ -0,0 +1,113 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| .text | |||||
| .align 5 | |||||
| .global ConvDwFp32Row | |||||
| #ifndef __APPLE__ | |||||
| .type ConvDwFp32Row, %function | |||||
| #endif | |||||
| // voidConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr, | |||||
| // size_t num_pixels, size_t input_channel, size_t input_step) | |||||
| // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels, | |||||
| // r4: input_channel, r5: input_step | |||||
| ConvDwFp32Row: | |||||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||||
| push {r4-r6, r8, r10, r11} | |||||
| vpush {q4-q7} | |||||
| add sp, sp, #88 | |||||
| mov r11, r0 | |||||
| ldr r4, [sp] | |||||
| ldr r5, [sp, #4] | |||||
| mov r6, #4 | |||||
| mul r5, r5, r6 | |||||
| cmp r3, #0 | |||||
| beq End | |||||
| LoopNumPixel: | |||||
| mov r6, r1 // input_ptr | |||||
| mov r8, r2 // filter_ptr | |||||
| mov r10, r4 // input_channel | |||||
| LoopDepth16In: | |||||
| cmp r10, #16 | |||||
| blt L4 | |||||
| sub r10, r10, #16 | |||||
| vld1.32 {q0, q1}, [r6]! | |||||
| vld1.32 {q4, q5}, [r8]! | |||||
| vld1.32 {q8, q9}, [r0]! | |||||
| cmp r10, #16 | |||||
| blt LoopDepth16Out | |||||
| LoopDepth16: | |||||
| vmla.f32 q8, q0, q4 | |||||
| vmla.f32 q9, q1, q5 | |||||
| vst1.32 {q8, q9}, [r11]! | |||||
| vld1.32 {q2, q3}, [r6]! | |||||
| vld1.32 {q6, q7}, [r8]! | |||||
| vld1.32 {q10, q11}, [r0]! | |||||
| vmla.f32 q10, q2, q6 | |||||
| vmla.f32 q11, q3, q7 | |||||
| vst1.32 {q10, q11}, [r11]! | |||||
| vld1.32 {q0, q1}, [r6]! | |||||
| vld1.32 {q4, q5}, [r8]! | |||||
| vld1.32 {q8, q9}, [r0]! | |||||
| sub r10, r10, #16 | |||||
| cmp r10, #16 | |||||
| bge LoopDepth16 | |||||
| LoopDepth16Out: | |||||
| vmla.f32 q8, q0, q4 | |||||
| vmla.f32 q9, q1, q5 | |||||
| vst1.32 {q8, q9}, [r11]! | |||||
| vld1.32 {q2, q3}, [r6]! | |||||
| vld1.32 {q6, q7}, [r8]! | |||||
| vld1.32 {q10, q11}, [r0]! | |||||
| vmla.f32 q10, q2, q6 | |||||
| vmla.f32 q11, q3, q7 | |||||
| vst1.32 {q10, q11}, [r11]! | |||||
| L4: | |||||
| cmp r10, #4 | |||||
| blt L0 | |||||
| LoopDepth4: | |||||
| vld1.32 {q0}, [r6]! | |||||
| vld1.32 {q4}, [r8]! | |||||
| vld1.32 {q8}, [r0]! | |||||
| vmla.f32 q8, q0, q4 | |||||
| vst1.32 {q8}, [r11]! | |||||
| sub r10, r10, #4 | |||||
| cmp r10, #4 | |||||
| bge LoopDepth4 | |||||
| L0: | |||||
| cmp r10, #0 | |||||
| beq Loop16LineEnd | |||||
| LoopDepth0: | |||||
| vld1.32 {s0}, [r6]! | |||||
| vld1.32 {s1}, [r8]! | |||||
| vld1.32 {s2}, [r0]! | |||||
| vmla.f32 s2, s0, s1 | |||||
| vst1.32 {s2}, [r11]! | |||||
| subs r10, r10, #1 | |||||
| bne LoopDepth0 | |||||
| Loop16LineEnd: | |||||
| subs r3, r3, #1 | |||||
| add r1, r1, r5 | |||||
| bne LoopNumPixel | |||||
| End: | |||||
| sub sp, sp, #88 | |||||
| vpop {q4-q7} | |||||
| pop {r4-r6, r8, r10, r11} | |||||
| bx lr | |||||
| #endif | |||||
| @@ -40,6 +40,11 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f | |||||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, size_t kernel_h, | void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, size_t kernel_h, | ||||
| size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | ||||
| size_t in_kh_step, size_t in_kw_step); | size_t in_kh_step, size_t in_kw_step); | ||||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | |||||
| size_t output_channel, size_t input_step); | |||||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | |||||
| #endif | #endif | ||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| @@ -49,12 +54,6 @@ void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | |||||
| void Relu6(float *data, size_t element4); | void Relu6(float *data, size_t element4); | ||||
| void Relu(float *data, size_t element4); | void Relu(float *data, size_t element4); | ||||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | |||||
| size_t output_channel, size_t input_step); | |||||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | |||||
| void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | ||||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | ||||
| @@ -70,5 +69,4 @@ void ConvSwFp32Center(float *dst, const float *src, const float *weight, const f | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| #endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */ | #endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */ | ||||
| @@ -21,7 +21,7 @@ | |||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||
| #endif | #endif | ||||
| #ifndef ENABLE_ARM64 | |||||
| #ifndef ENABLE_ARM | |||||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels, | void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels, | ||||
| int output_channel, int input_step) { | int output_channel, int input_step) { | ||||
| for (int i = 0; i < num_pixels; i++) { | for (int i = 0; i < num_pixels; i++) { | ||||
| @@ -202,7 +202,7 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl | |||||
| const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | ||||
| const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; | const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; | ||||
| #ifdef ENABLE_ARM64 | |||||
| #ifdef ENABLE_ARM | |||||
| ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | ||||
| sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | ||||
| conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6); | conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6); | ||||
| @@ -286,7 +286,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | ||||
| const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | ||||
| float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | ||||
| #ifdef ENABLE_ARM64 | |||||
| #ifdef ENABLE_ARM | |||||
| ConvDwFp32Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | ConvDwFp32Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | ||||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), | conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), | ||||
| sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), | sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), | ||||