Merge pull request !4523 from yangruoqi713/deconv_dw tags/v0.7.0-beta
| @@ -0,0 +1,39 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global DeconvDwFp32Border | |||
| #ifndef __APPLE__ | |||
| .type DeconvDwFp32Border, %function | |||
| #endif | |||
| // void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | |||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step (bytes), x6: in_kw_step (bytes), x7: kernel_w (despite the name: bytes added to the weight pointer per row) | |||
| DeconvDwFp32Border: | |||
| // Loads 4 floats from src once, then for every (row, column) of the height x width window does dst[0..3] += src[0..3] * weight[0..3], | |||
| // advancing dst by in_kw_step and weight by 16 bytes per column, and the row bases of dst/weight by in_kh_step/x7 per row. | |||
| // Both loops test their counter after the body, so height and width must be non-zero. Only v0-v2, x3 and x13-x17 are modified; | |||
| // none of these are callee-saved (v8-v15, x19-x29), so nothing is saved or restored. See https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| ld1 {v1.4s}, [x1] | |||
| mov x13, x0 | |||
| mov x14, x2 | |||
| LoopH: | |||
| mov x15, x13 | |||
| mov x16, x14 | |||
| mov x17, x4 | |||
| LoopW: | |||
| ld1 {v0.4s}, [x15] | |||
| ld1 {v2.4s}, [x16], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| st1 {v0.4s}, [x15], x6 | |||
| subs x17, x17, #1 | |||
| bne LoopW | |||
| subs x3, x3, #1 | |||
| add x13, x13, x5 | |||
| add x14, x14, x7 | |||
| bne LoopH | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,39 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global DeconvDwFp16Border | |||
| #ifndef __APPLE__ | |||
| .type DeconvDwFp16Border, %function | |||
| #endif | |||
| // void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, | |||
| // size_t in_kh_step, size_t in_kw_step, size_t kernel_w) | |||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: in_kh_step (bytes), x6: in_kw_step (bytes), x7: kernel_w (despite the name: bytes added to the weight pointer per row) | |||
| DeconvDwFp16Border: | |||
| // Loads 8 half-precision values from src once, then for every (row, column) of the height x width window does dst[0..7] += src[0..7] * weight[0..7], | |||
| // advancing dst by in_kw_step and weight by 16 bytes per column, and the row bases of dst/weight by in_kh_step/x7 per row. | |||
| // Both loops test their counter after the body, so height and width must be non-zero. Only v0-v2, x3 and x13-x17 are modified; | |||
| // none of these are callee-saved (v8-v15, x19-x29), so nothing is saved or restored. See https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| ld1 {v1.8h}, [x1] | |||
| mov x13, x0 | |||
| mov x14, x2 | |||
| LoopH: | |||
| mov x15, x13 | |||
| mov x16, x14 | |||
| mov x17, x4 | |||
| LoopW: | |||
| ld1 {v0.8h}, [x15] | |||
| ld1 {v2.8h}, [x16], #16 | |||
| fmla v0.8h, v1.8h, v2.8h | |||
| st1 {v0.8h}, [x15], x6 | |||
| subs x17, x17, #1 | |||
| bne LoopW | |||
| subs x3, x3, #1 | |||
| add x13, x13, x5 | |||
| add x14, x14, x7 | |||
| bne LoopH | |||
| ret | |||
| #endif | |||
| @@ -35,6 +35,8 @@ void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *wei | |||
| size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, | |||
| size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, | |||
| size_t relu, size_t relu6); | |||
| void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | |||
| void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| @@ -184,7 +184,7 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||
| /*deconv depthwise fp16 begin*/ | |||
| void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, | |||
| int width, int in_kh_step, int in_kw_step, int kernel_w) { | |||
| int width, int in_kh_step, int in_kw_step, int kernel_w_step) { | |||
| float16_t *dst_kh = dst; | |||
| const float16_t *weight_kh = weight; | |||
| for (int kh = 0; kh < height; kh++) { | |||
| @@ -201,7 +201,7 @@ void DeconvDepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| dst_kh += in_kh_step; | |||
| weight_kh += kernel_w * C8NUM; | |||
| weight_kh += kernel_w_step; | |||
| } // kernel_h loop | |||
| } | |||
| @@ -224,9 +224,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float | |||
| const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; | |||
| float16_t *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwFp16Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | |||
| conv_param->kernel_w_ * C8NUM * sizeof(float16_t)); | |||
| #else | |||
| DeconvDepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_); | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM); | |||
| #endif | |||
| src_kernel += sliding->block_channel_; | |||
| } // width loop | |||
| src_h += sliding->out_h_step_; | |||
| @@ -61,6 +61,10 @@ void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_ | |||
| void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t in_kh_step, size_t in_kw_step, size_t kernel_w); | |||
| void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||
| size_t plane_size, size_t stride, size_t relu_type); | |||
| #endif | |||
| @@ -634,7 +634,7 @@ void ConvDw3x3Fp32(float *output_data, const float *input_data, const float *wei | |||
| /*deconv depthwise fp32 begin*/ | |||
| void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weight, int height, int width, | |||
| int in_kh_step, int in_kw_step, int kernel_w) { | |||
| int in_kh_step, int in_kw_step, int kernel_w_step) { | |||
| float *dst_kh = dst; | |||
| const float *weight_kh = weight; | |||
| for (int kh = 0; kh < height; kh++) { | |||
| @@ -656,7 +656,7 @@ void DeconvDepthwiseBorderPixel(float *dst, const float *src, const float *weigh | |||
| weight_kw += C4NUM; | |||
| } // kernel_w loop | |||
| dst_kh += in_kh_step; | |||
| weight_kh += kernel_w * C4NUM; | |||
| weight_kh += kernel_w_step; | |||
| } // kernel_h loop | |||
| } | |||
| @@ -678,9 +678,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in | |||
| const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM; | |||
| float *dst_kernel = dst_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwFp32Border(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float), | |||
| conv_param->kernel_w_ * C4NUM * sizeof(float)); | |||
| #else | |||
| DeconvDepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_); | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM); | |||
| #endif | |||
| src_kernel += sliding->block_channel_; | |||
| } // width loop | |||
| src_h += sliding->out_h_step_; | |||