Merge pull request !6182 from liuzhongkai/arm32_new1tags/v1.0.0
| @@ -0,0 +1,63 @@ | |||
| #ifdef ENABLE_ARM32 | |||
| .text | |||
| .align 5 | |||
| .global ConvDwFp32Border | |||
| #ifndef __APPLE__ | |||
| .type ConvDwFp32Border, %function | |||
| #endif | |||
| // void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6) | |||
| // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step, | |||
| // r8: kernel_w, r9: relu, r10: relu6 | |||
| ConvDwFp32Border: | |||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||
| push {r4-r12, lr} | |||
| vpush {q4-q7} | |||
| add sp, sp, #104 | |||
| ldr r4, [sp] // height | |||
| ldr r5, [sp, #4] // width | |||
| ldr r6, [sp, #8] // in_kh_step | |||
| ldr r7, [sp, #12] // in_kw_step | |||
| ldr r8, [sp, #16] // kernel_w | |||
| ldr r9, [sp, #20] // relu | |||
| ldr r10, [sp, #24] // relu6 | |||
| vld1.32 {q0}, [r3] // bias | |||
| vmov.i32 q1, #6 // relu6 | |||
| vcvt.f32.s32 q1, q1 | |||
| veor q2, q2, q2 // relu | |||
| LoopH: | |||
| mov r11, r1 | |||
| mov r12, r2 | |||
| mov r14, r5 | |||
| LoopW: | |||
| vld1.32 {q3}, [r11], r7 | |||
| vld1.32 {q4}, [r12]! | |||
| vmla.f32 q0, q3, q4 | |||
| subs r14, r14, #1 | |||
| bne LoopW | |||
| subs r4, r4, #1 | |||
| add r1, r1, r6 | |||
| add r2, r2, r8 | |||
| bne LoopH | |||
| cmp r10, #0 | |||
| bne Relu6 | |||
| cmp r9, #0 | |||
| bne Relu | |||
| b Write | |||
| Relu6: | |||
| vmin.f32 q0, q0, q1 | |||
| Relu: | |||
| vmax.f32 q0, q0, q2 | |||
| Write: | |||
| vst1.32 {q0}, [r0] | |||
| sub sp, sp, #104 | |||
| vpop {q4-q7} | |||
| pop {r4-r12, pc} | |||
| #endif | |||
| @@ -11,9 +11,9 @@ | |||
| // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| // size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| // r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: weight, #56: kernel_h, #60: kernel_w, | |||
| // #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step | |||
| // #88: relu, #92: relu6 | |||
| // r0: dst, r1: src, r2: weight, r3: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, | |||
| // #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step,#36: in_kw_step | |||
| // #40: relu, #44: relu6 | |||
| ConvDwFp32Center: | |||
| // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" | |||
| // according to https://stackoverflow.com/questions/53625807 | |||
| @@ -24,7 +24,7 @@ ConvDwFp32Center: | |||
| vpush {q4-q7} | |||
| add sp, sp, #112 | |||
| ldr r4, [sp, #48] | |||
| ldr r4, [sp] // height | |||
| vld1.32 {q13}, [r3] | |||
| vmov.i32 q14, #6 | |||
| @@ -32,22 +32,25 @@ ConvDwFp32Center: | |||
| veor q15, q15, q15 | |||
| LoopH: | |||
| ldr r1, [sp, #4] // src_w | |||
| ldr r5, [sp, #52] // width | |||
| ldr r0, [sp] // dst_w | |||
| ldr r1, [sp, #-44] // src_w, src_h = src | |||
| ldr r5, [sp, #4] // width | |||
| ldr r0, [sp, #-48] // dst_w, dst_h = dst | |||
| cmp r5, #4 | |||
| blt LoopW | |||
| LoopW4: | |||
| ldr r11, [sp, #76] // in_sw_step | |||
| mov r8, r1 // src_kh | |||
| ldr r2, [sp, #8] // weight_kh | |||
| ldr r6, [sp, #56] // kernel_h | |||
| ldr r11, [sp, #28] // in_sw_step | |||
| mov r8, r1 // src_kh, src_w | |||
| ldr r2, [sp, #-40] // weight_kh, weight | |||
| ldr r6, [sp, #8] // kernel_h | |||
| vmov q0, q13 | |||
| vmov q1, q13 | |||
| vmov q2, q13 | |||
| vmov q3, q13 | |||
| LoopKh4: | |||
| ldr r12, [sp, #80] //in_kh_step | |||
| ldr r7, [sp, #60] // kernel_w | |||
| mov lr, r8 // src_kw | |||
| ldr r7, [sp, #12] // kernel_w | |||
| mov lr, r8 // src_kw, src_kh | |||
| LoopKw4: | |||
| ldr r12, [sp, #36] //in_kw_step | |||
| mov r10, lr | |||
| vld1.32 {q12}, [r2]! | |||
| vld1.32 {q4}, [r10] | |||
| @@ -65,14 +68,14 @@ ConvDwFp32Center: | |||
| subs r7, r7, #1 | |||
| add lr, lr, r12 | |||
| bne LoopKw4 | |||
| ldr r12, [sp, #80] | |||
| ldr r12, [sp, #32] // in_kh_step | |||
| add r8, r8, r12 | |||
| subs r6, r6, #1 | |||
| bne LoopKh4 | |||
| ldr r12, [sp, #92] | |||
| ldr r12, [sp, #44] | |||
| cmp r12, #0 | |||
| bne Relu64 | |||
| ldr r12, [sp, #88] | |||
| ldr r12, [sp, #40] | |||
| cmp r12, #0 | |||
| bne Relu4 | |||
| b Write4 | |||
| @@ -87,7 +90,7 @@ ConvDwFp32Center: | |||
| vmax.f32 q2, q2, q15 | |||
| vmax.f32 q3, q3, q15 | |||
| Write4: | |||
| ldr r12, [sp, #68] | |||
| ldr r12, [sp, #20] // block_channel | |||
| vst1.32 {q0}, [r0] | |||
| add r0, r0, r12 | |||
| vst1.32 {q1}, [r0] | |||
| @@ -98,36 +101,36 @@ ConvDwFp32Center: | |||
| add r0, r0, r12 | |||
| mov r12, #4 | |||
| mul r11, r11, r12 | |||
| add r1, r1, r11 | |||
| add r1, r1, r11 // src_w += in_sw_step | |||
| sub r5, r5, #4 | |||
| cmp r5, #0 | |||
| ble LoopWEnd | |||
| cmp r5, #4 | |||
| bge LoopW | |||
| LoopW: | |||
| mov r8, r1 // src_kh | |||
| ldr r2, [sp, #8] // weight_kh | |||
| ldr r6, [sp, #56] // kernel_h | |||
| vmov q0, q13 | |||
| mov r8, r1 // src_kh, src_w | |||
| ldr r2, [sp, #-40] // weight_kh, weight | |||
| ldr r6, [sp, #8] // kernel_h | |||
| vmov q0, q13 // bias | |||
| LoopKh: | |||
| ldr r12, [sp, #84] //in_kw_step | |||
| ldr r7, [sp, #60] // kernel_w | |||
| mov r10, r8 // src_kw | |||
| ldr r7, [sp, #12] // kernel_w | |||
| mov r10, r8 // src_kw, src_kh | |||
| LoopKw: | |||
| ldr r12, [sp, #36] //in_kw_step | |||
| vld1.32 {q1}, [r10] | |||
| add r10, r10, r12 | |||
| vld1.32 {q12}, [r2]! | |||
| vmla.f32 q0, q1, q12 | |||
| subs r7, r7, #1 | |||
| bne LoopKw | |||
| ldr r12, [sp, #80] | |||
| ldr r12, [sp, #32] // in_kh_step | |||
| add r8, r8, r12 | |||
| subs r6, r6, #1 | |||
| bne LoopKh | |||
| ldr r12, [sp, #92] | |||
| ldr r12, [sp, #44] | |||
| cmp r12, #0 | |||
| bne Relu6 | |||
| ldr r12, [sp, #88] | |||
| ldr r12, [sp, #40] | |||
| cmp r12, #0 | |||
| bne Relu | |||
| b Write | |||
| @@ -136,22 +139,24 @@ ConvDwFp32Center: | |||
| Relu: | |||
| vmax.f32 q0, q0, q15 | |||
| Write: | |||
| ldr r12, [sp, #68] | |||
| vst1.32 {q0}, [r0] | |||
| ldr r12, [sp, #20] // block_channel | |||
| vst1.32 {q0}, [r0] // dst_kw += block_channel | |||
| add r0, r0, r12 | |||
| ldr r12, [sp, #76] | |||
| add r1, r1, r12 | |||
| ldr r12, [sp, #28] // in_sw_step | |||
| add r1, r1, r12 // src_w += in_sw_step | |||
| subs r5, r5, #1 | |||
| bne LoopW | |||
| ldr r3, [sp, #64] | |||
| ldr r12, [sp] | |||
| ldr r3, [sp, #16] // out_h_step | |||
| ldr r12, [sp, #-48] | |||
| add r12, r12, r3 | |||
| str r12, [sp] | |||
| ldr r3, [sp, #72] | |||
| ldr r12, [sp, #4] | |||
| str r12, [sp, #-48] | |||
| ldr r3, [sp, #24] // in_sh_step | |||
| ldr r12, [sp, #-44] // src_h += in_sh_step | |||
| add r12, r12, r3 | |||
| str r12, [sp, #4] | |||
| subs r4, r4, #1 | |||
| str r12, [sp, #-44] | |||
| subs r4, r4, #1 // height | |||
| bne LoopH | |||
| LoopWEnd: | |||
| sub sp, sp, #112 | |||
| @@ -0,0 +1,113 @@ | |||
| #ifdef ENABLE_ARM32 | |||
| .text | |||
| .align 5 | |||
| .global ConvDwFp32Row | |||
| #ifndef __APPLE__ | |||
| .type ConvDwFp32Row, %function | |||
| #endif | |||
| // voidConvDwFp32Row(float* output_ptr, const float* input_ptr, const float* filter_ptr, | |||
| // size_t num_pixels, size_t input_channel, size_t input_step) | |||
| // r0: output_ptr, r1: input_ptr, r2: filter_ptr, r3: num_pixels, | |||
| // r4: input_channel, r5: input_step | |||
| ConvDwFp32Row: | |||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||
| push {r4-r6, r8, r10, r11} | |||
| vpush {q4-q7} | |||
| add sp, sp, #88 | |||
| mov r11, r0 | |||
| ldr r4, [sp] | |||
| ldr r5, [sp, #4] | |||
| mov r6, #4 | |||
| mul r5, r5, r6 | |||
| cmp r3, #0 | |||
| beq End | |||
| LoopNumPixel: | |||
| mov r6, r1 // input_ptr | |||
| mov r8, r2 // filter_ptr | |||
| mov r10, r4 // input_channel | |||
| LoopDepth16In: | |||
| cmp r10, #16 | |||
| blt L4 | |||
| sub r10, r10, #16 | |||
| vld1.32 {q0, q1}, [r6]! | |||
| vld1.32 {q4, q5}, [r8]! | |||
| vld1.32 {q8, q9}, [r0]! | |||
| cmp r10, #16 | |||
| blt LoopDepth16Out | |||
| LoopDepth16: | |||
| vmla.f32 q8, q0, q4 | |||
| vmla.f32 q9, q1, q5 | |||
| vst1.32 {q8, q9}, [r11]! | |||
| vld1.32 {q2, q3}, [r6]! | |||
| vld1.32 {q6, q7}, [r8]! | |||
| vld1.32 {q10, q11}, [r0]! | |||
| vmla.f32 q10, q2, q6 | |||
| vmla.f32 q11, q3, q7 | |||
| vst1.32 {q10, q11}, [r11]! | |||
| vld1.32 {q0, q1}, [r6]! | |||
| vld1.32 {q4, q5}, [r8]! | |||
| vld1.32 {q8, q9}, [r0]! | |||
| sub r10, r10, #16 | |||
| cmp r10, #16 | |||
| bge LoopDepth16 | |||
| LoopDepth16Out: | |||
| vmla.f32 q8, q0, q4 | |||
| vmla.f32 q9, q1, q5 | |||
| vst1.32 {q8, q9}, [r11]! | |||
| vld1.32 {q2, q3}, [r6]! | |||
| vld1.32 {q6, q7}, [r8]! | |||
| vld1.32 {q10, q11}, [r0]! | |||
| vmla.f32 q10, q2, q6 | |||
| vmla.f32 q11, q3, q7 | |||
| vst1.32 {q10, q11}, [r11]! | |||
| L4: | |||
| cmp r10, #4 | |||
| blt L0 | |||
| LoopDepth4: | |||
| vld1.32 {q0}, [r6]! | |||
| vld1.32 {q4}, [r8]! | |||
| vld1.32 {q8}, [r0]! | |||
| vmla.f32 q8, q0, q4 | |||
| vst1.32 {q8}, [r11]! | |||
| sub r10, r10, #4 | |||
| cmp r10, #4 | |||
| bge LoopDepth4 | |||
| L0: | |||
| cmp r10, #0 | |||
| beq Loop16LineEnd | |||
| LoopDepth0: | |||
| vld1.32 {s0}, [r6]! | |||
| vld1.32 {s1}, [r8]! | |||
| vld1.32 {s2}, [r0]! | |||
| vmla.f32 s2, s0, s1 | |||
| vst1.32 {s2}, [r11]! | |||
| subs r10, r10, #1 | |||
| bne LoopDepth0 | |||
| Loop16LineEnd: | |||
| subs r3, r3, #1 | |||
| add r1, r1, r5 | |||
| bne LoopNumPixel | |||
| End: | |||
| sub sp, sp, #88 | |||
| vpop {q4-q7} | |||
| pop {r4-r6, r8, r10, r11} | |||
| bx lr | |||
| #endif | |||
| @@ -40,6 +40,11 @@ void ConvDwFp32Center(float *dst, const float *src, const float *weight, const f | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, size_t kernel_h, | |||
| size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| size_t in_kh_step, size_t in_kw_step); | |||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | |||
| size_t output_channel, size_t input_step); | |||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| @@ -49,12 +54,6 @@ void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void Relu6(float *data, size_t element4); | |||
| void Relu(float *data, size_t element4); | |||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | |||
| size_t output_channel, size_t input_step); | |||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | |||
| @@ -70,5 +69,4 @@ void ConvSwFp32Center(float *dst, const float *src, const float *weight, const f | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* MINDSPORE_LITE_NNACL_FP32_COMMON_FUNC_H_ */ | |||
| @@ -21,7 +21,7 @@ | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #ifndef ENABLE_ARM64 | |||
| #ifndef ENABLE_ARM | |||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels, | |||
| int output_channel, int input_step) { | |||
| for (int i = 0; i < num_pixels; i++) { | |||
| @@ -202,7 +202,7 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl | |||
| const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | |||
| const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; | |||
| #ifdef ENABLE_ARM64 | |||
| #ifdef ENABLE_ARM | |||
| ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | |||
| conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6); | |||
| @@ -286,7 +286,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig | |||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; | |||
| const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | |||
| float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| #ifdef ENABLE_ARM | |||
| ConvDwFp32Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | |||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float), | |||
| sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float), | |||