@@ -139,7 +139,7 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
   }
   ConvolutionBaseCPUKernel::Init();
-  InitSlidingParam(sliding_, conv_param_, C8NUM);
+  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
   auto ret = InitBuffer();
   if (ret != 0) {
@@ -55,7 +55,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
   // init sliding_ window param
-  InitSlidingParam(sliding_, conv_param_, C8NUM);
+  InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
   return RET_OK;
 }
@@ -0,0 +1,56 @@
+#ifdef __aarch64__
+    .text
+    .align 5
+    .global ConvDwFp32Border
+#ifndef __APPLE__
+    .type ConvDwFp32Border, %function
+#endif
+
+// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+//                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
+// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
+// x8: kernel_w, x9: relu, x10: relu6
+
+ConvDwFp32Border:
+    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    // x19 ~ x29 should also be preserved,
+    // whereas our coding style does not permit such an amount of parameters
+    ldr x8, [sp]
+    ldr x9, [sp, #8]
+    ldr x10, [sp, #16]
+
+    ld1 {v0.4s}, [x3]       // bias
+    movi v1.4s, #6          // relu6
+    scvtf v1.4s, v1.4s
+    dup v2.4s, wzr          // relu
+
+    mov x13, x1
+    mov x14, x2
+    LoopH:
+        mov x15, x13
+        mov x16, x14
+        mov x17, x5
+        LoopW:
+            ld1 {v3.4s}, [x15], x7
+            ld1 {v4.4s}, [x16], #16
+            fmla v0.4s, v3.4s, v4.4s
+            subs x17, x17, #1
+            bne LoopW
+        subs x4, x4, #1
+        add x13, x13, x6
+        add x14, x14, x8
+        bne LoopH
+
+    cbnz x10, Relu6
+    cbnz x9, Relu
+    b Write
+    Relu6:
+        fmin v0.4s, v0.4s, v1.4s
+    Relu:
+        fmax v0.4s, v0.4s, v2.4s
+    Write:
+        st1 {v0.4s}, [x0]
+
+    ret
+#endif
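For readers who do not follow AArch64 assembly, here is an editorial reference sketch in C of what the new ConvDwFp32Border kernel computes; it is not part of the patch, and the name ConvDwFp32BorderRef is hypothetical. It assumes the stride arguments are byte offsets and that kernel_w is the byte size of one kernel row, which is how the ARM64 call site later in this diff passes them: accumulate a 4-lane multiply-add over the kernel window on top of the bias, apply the optional relu/relu6 clamps, then store four floats.

#include <stddef.h>

#define C4NUM 4

/* Hypothetical reference implementation; strides are byte offsets, as passed by the ARM64 call site. */
static void ConvDwFp32BorderRef(float *dst, const float *src, const float *weight, const float *bias,
                                size_t height, size_t width, size_t in_kh_step, size_t in_kw_step,
                                size_t kernel_w, size_t relu, size_t relu6) {
  float acc[C4NUM];
  for (int c = 0; c < C4NUM; c++) {
    acc[c] = bias[c];                                  /* ld1 {v0.4s}, [x3] */
  }
  const char *src_kh = (const char *)src;
  const char *weight_kh = (const char *)weight;
  for (size_t kh = 0; kh < height; kh++) {             /* LoopH */
    const char *src_kw = src_kh;
    const float *weight_kw = (const float *)weight_kh;
    for (size_t kw = 0; kw < width; kw++) {            /* LoopW */
      const float *s = (const float *)src_kw;
      for (int c = 0; c < C4NUM; c++) {
        acc[c] += s[c] * weight_kw[c];                 /* fmla v0.4s, v3.4s, v4.4s */
      }
      src_kw += in_kw_step;                            /* post-increment by x7 (bytes) */
      weight_kw += C4NUM;                              /* post-increment by #16 bytes */
    }
    src_kh += in_kh_step;                              /* add x13, x13, x6 */
    weight_kh += kernel_w;                             /* add x14, x14, x8 (bytes per kernel row) */
  }
  for (int c = 0; c < C4NUM; c++) {
    if (relu6) acc[c] = acc[c] > 6.0f ? 6.0f : acc[c];          /* fmin with v1 = 6.0 */
    if (relu || relu6) acc[c] = acc[c] < 0.0f ? 0.0f : acc[c];  /* fmax with v2 = 0 */
    dst[c] = acc[c];                                   /* st1 {v0.4s}, [x0] */
  }
}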
@@ -10,7 +10,7 @@
 // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
 //                       size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
 //                       size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6);
-// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w,
+// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: kernel_h, x7: kernel_w,
 // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step
 // x14: relu, x15: relu6
 ConvDwFp32Center:
@@ -58,6 +58,9 @@ void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc,
 void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride);
 void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
 void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride);
+void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
+                      size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6);
 #endif
 #ifdef __cplusplus
@@ -86,8 +86,9 @@ void AppendSlidingParamConvDw(SlidingWindowParam *sliding, const ConvParameter *
 }
 
 /*conv depthwise fp32 begin*/
+#ifndef ENABLE_ARM64
 void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
-                          int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) {
+                          int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu, bool is_relu6) {
   const float *src_kh = src;
   const float *weight_kh = weight;
   for (int c = 0; c < C4NUM; c++) {
@@ -97,22 +98,14 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
     const float *src_kw = src_kh;
     const float *weight_kw = weight_kh;
     for (int kw = 0; kw < width; kw++) {
-#ifdef ENABLE_ARM64
-      float32x4_t src_4 = vld1q_f32(src_kw);
-      float32x4_t weight_4 = vld1q_f32(weight_kw);
-      float32x4_t dst_4 = vld1q_f32(dst);
-      dst_4 = vfmaq_f32(dst_4, src_4, weight_4);
-      vst1q_f32(dst, dst_4);
-#else
       for (int c = 0; c < C4NUM; c++) {
        dst[c] += src_kw[c] * weight_kw[c];
       }
-#endif
       src_kw += in_kw_step;
       weight_kw += C4NUM;
     }  // kernel_w loop
     src_kh += in_kh_step;
-    weight_kh += kernel_w * C4NUM;
+    weight_kh += kernel_w_step;
   }  // kernel_h loop
   for (int c = 0; c < C4NUM; c++) {
     dst[c] += bias[c];
@@ -120,6 +113,7 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
     dst[c] = (is_relu6) ? (MSMIN(6, MSMAX(0, dst[c]))) : (dst[c]);
   }
 }
+#endif
 
 void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom,
                      int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
@@ -140,10 +134,15 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
       const float *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
       const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C4NUM;
+#ifdef ENABLE_ARM64
+      ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
+                       sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
+                       conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
+#else
       DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
-                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
-                           conv_param->is_relu6_);
+                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM,
+                           conv_param->is_relu_, conv_param->is_relu6_);
+#endif
       dst_kernel += sliding->block_channel_;
     }  // width loop
     dst_h += sliding->out_h_step_;
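A note on units in the dispatch above (editorial, not part of the patch): the assembly kernel advances raw pointers, so this call site passes byte offsets and a per-row weight stride already multiplied by C4NUM, while the C fallback keeps element strides and now receives kernel_w_ * C4NUM as kernel_w_step instead of multiplying inside its loop. A small hypothetical helper, only to make the conversion explicit:

#include <stddef.h>

#define C4NUM 4

/* Hypothetical helper for illustration: derive the byte strides expected by the
 * ConvDwFp32Border assembly kernel from the element strides used by the C fallback. */
typedef struct {
  size_t in_kh_step_bytes;  /* sliding->in_kh_step_ * sizeof(float) */
  size_t in_kw_step_bytes;  /* sliding->in_kw_step_ * sizeof(float) */
  size_t weight_row_bytes;  /* kernel_w_ * C4NUM * sizeof(float): advance per kernel row */
} BorderAsmStrides;

static BorderAsmStrides ToBorderAsmStrides(int in_kh_step, int in_kw_step, int kernel_w) {
  BorderAsmStrides s;
  s.in_kh_step_bytes = (size_t)in_kh_step * sizeof(float);
  s.in_kw_step_bytes = (size_t)in_kw_step * sizeof(float);
  s.weight_row_bytes = (size_t)kernel_w * C4NUM * sizeof(float);
  return s;
}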
@@ -5,7 +5,7 @@ gender_res_large_deploy
 glasses
 hat
 isface
-#ml_bank_detect_0312
+ml_bank_detect_0312
 ml_face_div_parsing
 ml_hardware_eyeclose
 ml_ocr_detect_20200305