| @@ -0,0 +1,56 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global ConvDwFp16Border | |||
| #ifndef __APPLE__ | |||
| .type ConvDwFp16Border, %function | |||
| #endif | |||
| // void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| // size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, | |||
| // size_t relu6) | |||
| // | |||
| // Computes one border output pixel for a block of 8 fp16 channels: | |||
| // acc = bias; for every kernel row and column: acc += src * weight; optional relu6 / relu; dst = acc | |||
| // | |||
| // Register arguments: | |||
| // x0: dst, x1: src, x2: weight, x3: bias | |||
| // x4: height, number of kernel rows to accumulate | |||
| // x5: width, number of kernel columns to accumulate per row | |||
| // x6: in_kh_step, distance in bytes between two source rows | |||
| // x7: in_kw_step, distance in bytes between two source columns | |||
| // Stack arguments (loaded into x8 ~ x10 below): | |||
| // [sp] -> x8: kernel_w, used as the distance in bytes between two weight rows (not a column count) | |||
| // [sp, #8] -> x9: relu, non-zero clamps the result to >= 0 | |||
| // [sp, #16] -> x10: relu6, non-zero clamps the result to [0, 6] | |||
| // | |||
| // height and width are compared as signed values: when either is <= 0 the accumulation is skipped and | |||
| // dst = activation(bias), which is what the C reference loop produces for an empty kernel window. | |||
| // | |||
| // Clobbers: x4, x8 ~ x10, x13 ~ x17, v0 ~ v4, flags. | |||
| // Leaf function: the stack is only read, and no callee-saved register (x19 ~ x29, v8 ~ v15) is touched, | |||
| // so nothing has to be saved or restored. | |||
| ConvDwFp16Border: | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ld1 {v0.8h}, [x3] // accumulator starts from bias | |||
| movi v1.8h, #0x46, lsl #8 // 0x4600 is 6.0 in fp16, upper bound for relu6 | |||
| dup v2.4s, wzr // all zero, lower bound for relu / relu6 | |||
| // The loops below are do-while loops, so an empty window must be filtered out here; otherwise the | |||
| // counters would wrap around and the loads would run far outside src and weight. | |||
| cmp x4, #0 | |||
| b.le Activation | |||
| cmp x5, #0 | |||
| b.le Activation | |||
| mov x13, x1 // x13: start of the current source row | |||
| mov x14, x2 // x14: start of the current weight row | |||
| LoopH: | |||
| mov x15, x13 // x15: source cursor inside the row | |||
| mov x16, x14 // x16: weight cursor inside the row | |||
| mov x17, x5 // x17: columns left in this row | |||
| LoopW: | |||
| ld1 {v3.8h}, [x15], x7 // 8 source channels, then step to the next column | |||
| ld1 {v4.8h}, [x16], #16 // 8 weights, stored contiguously (16 bytes per column) | |||
| fmla v0.8h, v3.8h, v4.8h | |||
| subs x17, x17, #1 | |||
| b.ne LoopW | |||
| subs x4, x4, #1 | |||
| add x13, x13, x6 // add does not touch the flags set by subs | |||
| add x14, x14, x8 | |||
| b.ne LoopH | |||
| Activation: | |||
| cbnz x10, Relu6 | |||
| cbnz x9, Relu | |||
| b Write | |||
| Relu6: | |||
| fmin v0.8h, v0.8h, v1.8h // upper clamp, then fall through to the lower clamp | |||
| Relu: | |||
| fmax v0.8h, v0.8h, v2.8h | |||
| Write: | |||
| st1 {v0.8h}, [x0] | |||
| ret | |||
| #endif | |||
| @@ -28,6 +28,9 @@ extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| // Arm64 assembly kernel that accumulates one border output pixel for a block of 8 fp16 channels: | |||
| // it starts from bias, adds src * weight over a height x width kernel window, optionally applies | |||
| // relu / relu6 (non-zero flag enables it) and stores the 8 results to dst. | |||
| // in_kh_step and in_kw_step are source strides in bytes; kernel_w is consumed as the weight row stride | |||
| // in bytes, so callers pass the row size in bytes rather than the column count. | |||
| void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, | |||
| size_t relu6); | |||
| void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, | |||
| size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, | |||
| @@ -20,7 +20,7 @@ | |||
| /*conv depthwise fp16 begin*/ | |||
| void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, | |||
| int height, int width, int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu, | |||
| bool is_relu6) { | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst[c] = 0; | |||
| @@ -41,7 +41,7 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1 | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| src_kh += in_kh_step; | |||
| weight_kh += kernel_w * C8NUM; | |||
| weight_kh += kernel_w_step; | |||
| } // kernel_h loop | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst[c] += bias[c]; | |||
| @@ -69,11 +69,15 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; | |||
| const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | |||
| conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_); | |||
| #else | |||
| DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_, | |||
| conv_param->is_relu6_); | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, | |||
| conv_param->is_relu_, conv_param->is_relu6_); | |||
| #endif | |||
| dst_kernel += sliding->block_channel_; | |||
| } // width loop | |||
| dst_h += sliding->out_h_step_; | |||