Browse Source

!4496 [MS][LITE] optimize arm cpu fp16 op: add assembly file for conv depthwise border

Merge pull request !4496 from yangruoqi713/lite
tags/v0.7.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
a69dda0559
3 changed files with 69 additions and 6 deletions
  1. +56
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/ConvDwFp16Border.S
  2. +3
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h
  3. +10
    -6
      mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c

+ 56
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/ConvDwFp16Border.S View File

@@ -0,0 +1,56 @@
#ifdef __aarch64__

.text
.align 5
.global ConvDwFp16Border
#ifndef __APPLE__
.type ConvDwFp16Border, %function
#endif

// void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
// size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
// size_t relu6)

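// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width, x6: in_kh_step, x7: in_kw_step,
// passed on the stack and loaded into registers at function entry:
// x8: kernel_w (added to the weight pointer after each kernel row), x9: relu, x10: relu6
// in_kh_step, in_kw_step and kernel_w are added to the pointers unscaled, i.e. they are byte offsets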
ConvDwFp16Border:
    // Computes one border output pixel for 8 fp16 channels:
    //   dst[0..7] = activation(bias[0..7] + sum over the height x width window of src * weight)
    //
    // Leaf function. Only caller-saved registers are used (x8-x10, x13-x17, v0-v4), so nothing
    // has to be spilled: AAPCS64 requires a callee to preserve x19-x29 and v8-v15, none of which
    // are touched here.
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    //
    // Register roles:
    //   x0  dst                     x1  src                   x2  weight
    //   x3  bias                    x4  rows left (height)    x5  width
    //   x6  src step per row        x7  src step per column   x8  weight step per row
    //   x9  relu flag               x10 relu6 flag
    //   x13/x14 src/weight pointer at the start of the current row
    //   x15/x16 src/weight pointer inside the current row, x17 columns left
    //   v0  accumulator (starts as bias), v1 = 6.0 in every lane, v2 = 0.0 in every lane
    // All steps are added to the pointers unscaled, i.e. they are byte offsets.

    // Arguments 9..11 do not fit in x0-x7 and are read from the caller's stack.
    ldr x8, [sp]
    ldr x9, [sp, #8]
    ldr x10, [sp, #16]

    ld1 {v0.8h}, [x3]             // accumulator starts as bias
    movi v1.8h, #0x46, lsl #8     // 0x4600 is fp16 6.0: upper clamp for relu6
    dup v2.4s, wzr                // all lanes 0.0: lower clamp for relu / relu6

    // Both loops below test their counter at the bottom, so they always execute at least once.
    // An extent of zero would wrap the counter to 2^64-1 and read far outside src/weight.
    // The signed compare also rejects a negative int that the caller converted to size_t.
    // With an empty window nothing is accumulated and the result is activation(bias).
    // NOTE(review): presumably this matches the C fallback's for-loops - confirm.
    cmp x4, #0
    ble Activate
    cmp x5, #0
    ble Activate

    mov x13, x1
    mov x14, x2
    LoopH:
        mov x15, x13
        mov x16, x14
        mov x17, x5
        LoopW:
            ld1 {v3.8h}, [x15], x7      // 8 input channels, then advance by in_kw_step
            ld1 {v4.8h}, [x16], #16     // 8 weights, then advance by 8 * sizeof(fp16)
            fmla v0.8h, v3.8h, v4.8h
            subs x17, x17, #1
            bne LoopW
        subs x4, x4, #1                 // sets the flags tested by bne; add does not alter them
        add x13, x13, x6
        add x14, x14, x8
        bne LoopH

    Activate:
        cbnz x10, Relu6
        cbnz x9, Relu
        b Write
    Relu6:
        fmin v0.8h, v0.8h, v1.8h        // clamp to <= 6, then fall through for the >= 0 clamp
    Relu:
        fmax v0.8h, v0.8h, v2.8h
    Write:
        st1 {v0.8h}, [x0]

    ret
#endif

+ 3
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/common_func.h View File

@@ -28,6 +28,9 @@ extern "C" {
#endif #endif


#ifdef ENABLE_ARM64 #ifdef ENABLE_ARM64
void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
size_t relu6);
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,


+ 10
- 6
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.c View File

@@ -20,7 +20,7 @@


/*conv depthwise fp16 begin*/ /*conv depthwise fp16 begin*/
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
int height, int width, int in_kh_step, int in_kw_step, int kernel_w_step, bool is_relu,
bool is_relu6) { bool is_relu6) {
for (int c = 0; c < C8NUM; c++) { for (int c = 0; c < C8NUM; c++) {
dst[c] = 0; dst[c] = 0;
@@ -41,7 +41,7 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1
weight_kw += C8NUM; weight_kw += C8NUM;
} // kernel_w loop } // kernel_w loop
src_kh += in_kh_step; src_kh += in_kh_step;
weight_kh += kernel_w * C8NUM;
weight_kh += kernel_w_step;
} // kernel_h loop } // kernel_h loop
for (int c = 0; c < C8NUM; c++) { for (int c = 0; c < C8NUM; c++) {
dst[c] += bias[c]; dst[c] += bias[c];
@@ -69,11 +69,15 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *


const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_; const float16_t *src_kernel = src_w + start_kh * sliding->in_kh_step_ + start_kw * sliding->in_kw_step_;
const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM; const float16_t *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * C8NUM;

#ifdef ENABLE_ARM64
ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
#else
DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw, DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_, conv_param->is_relu_,
conv_param->is_relu6_);

sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM,
conv_param->is_relu_, conv_param->is_relu6_);
#endif
dst_kernel += sliding->block_channel_; dst_kernel += sliding->block_channel_;
} // width loop } // width loop
dst_h += sliding->out_h_step_; dst_h += sliding->out_h_step_;


Loading…
Cancel
Save