Browse Source

[MSLITE] deconv winograd fp16 neon

tags/v1.1.0
ling 5 years ago
parent
commit
51fced3767
12 changed files with 340 additions and 40 deletions
  1. +279
    -0
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
  2. +5
    -1
      mindspore/lite/nnacl/fp16/common_func_fp16.c
  3. +2
    -0
      mindspore/lite/nnacl/fp16/common_func_fp16.h
  4. +1
    -1
      mindspore/lite/nnacl/fp16/deconv_winograd_fp16.c
  5. +0
    -17
      mindspore/lite/src/common/file_utils.cc
  6. +9
    -1
      mindspore/lite/src/common/file_utils.h
  7. +1
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
  8. +0
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
  9. +39
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc
  10. +2
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h
  11. +1
    -0
      mindspore/lite/test/models_arm32.cfg
  12. +1
    -0
      mindspore/lite/test/models_caffe.cfg

+ 279
- 0
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S View File

@@ -0,0 +1,279 @@

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4Fp16
#ifndef __APPLE__
.type PostFuncBiasReluC4Fp16, %function
#endif

//void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
// size_t plane_size, size_t plane_stride, size_t relu_type);
//
// Repacks fp16 data from the C4-tiled layout in src into the NHWC layout in
// dst, adding the per-channel bias and applying an optional activation.
// relu_type: 1 = ReLU (floor at 0), 3 = ReLU6 (clamp to [0, 6]), other = none.
//
// x0 dst x1 src x2 bias
// w3 oc4div w4 oc4mod w5 plane_size
// x6 plane_stride x7 relu_type
//
// Register roles inside the routine:
// v26 = {6.0h x4} (ReLU6 cap) v27 = {0.0h x4} (ReLU floor)
// x12 = dst pixel stride in bytes = (oc4div + oc4mod) * sizeof(float16_t)
// w10 = channels already emitted w13 = remaining pixels in current block
// x15 = dst write cursor v16 = bias for the current channel group

PostFuncBiasReluC4Fp16:

// Materialize the activation bounds: v26 = 6.0 per lane, v27 = 0.0 per lane.
movi v26.4h, #6
scvtf v26.4h, v26.4h
dup v27.4h, wzr

// x12 = (oc4div + oc4mod) * 2: byte distance between consecutive pixels of
// the same channel group in the NHWC destination.
mov x10, #2
add x12, x3, x4
mul x12, x12, x10

mov w10, #0                       // w10 = output-channel offset, advances by 4

// Outer loop: one iteration per full block of 4 output channels.
Loop_C4:
cmp w10, w3
beq Loop_C1                       // all full C4 blocks done -> handle the oc4mod tail
mov x15, #2
mul x14, x10, x15                 // x14 = channel offset in bytes
add x15, x0, x14                  // x15 = dst column for this channel group
add w10, w10, #4
mov w13, w5                       // reset the per-block pixel counter
ld1 {v16.4h}, [x2], #8            // load bias for these 4 channels

// Main body: 8 pixels (8 x 4 fp16) per iteration while at least 8 remain.
Loop_8x4:
cmp w13, #8
blt Loop_4x4
sub w13, w13, #8
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32

// Add the per-channel bias to every pixel.
fadd v0.4h, v0.4h, v16.4h
fadd v1.4h, v1.4h, v16.4h
fadd v2.4h, v2.4h, v16.4h
fadd v3.4h, v3.4h, v16.4h
fadd v4.4h, v4.4h, v16.4h
fadd v5.4h, v5.4h, v16.4h
fadd v6.4h, v6.4h, v16.4h
fadd v7.4h, v7.4h, v16.4h

// Activation dispatch; Relu6 deliberately falls through into Relu so ReLU6
// gets both the upper clamp (fmin) and the lower clamp (fmax).
cmp x7, #3
beq Relu6_8x4
cmp x7, #1
beq Relu_8x4
b Write_8x4
Relu6_8x4:
fmin v0.4h, v0.4h, v26.4h
fmin v1.4h, v1.4h, v26.4h
fmin v2.4h, v2.4h, v26.4h
fmin v3.4h, v3.4h, v26.4h
fmin v4.4h, v4.4h, v26.4h
fmin v5.4h, v5.4h, v26.4h
fmin v6.4h, v6.4h, v26.4h
fmin v7.4h, v7.4h, v26.4h
Relu_8x4:
fmax v0.4h, v0.4h, v27.4h
fmax v1.4h, v1.4h, v27.4h
fmax v2.4h, v2.4h, v27.4h
fmax v3.4h, v3.4h, v27.4h
fmax v4.4h, v4.4h, v27.4h
fmax v5.4h, v5.4h, v27.4h
fmax v6.4h, v6.4h, v27.4h
fmax v7.4h, v7.4h, v27.4h
Write_8x4:
// Scatter the 8 pixels into NHWC: one 4-channel store per pixel, stride x12.
st1 {v0.4h}, [x15], x12
st1 {v1.4h}, [x15], x12
st1 {v2.4h}, [x15], x12
st1 {v3.4h}, [x15], x12
st1 {v4.4h}, [x15], x12
st1 {v5.4h}, [x15], x12
st1 {v6.4h}, [x15], x12
st1 {v7.4h}, [x15], x12
b Loop_8x4

// At most one group of 4 pixels can remain here (w13 < 8); handle it, then
// fall through to the 1-pixel tail.
Loop_4x4:
cmp w13, #4
blt Loop_1x4
sub w13, w13, #4
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
fadd v0.4h, v0.4h, v16.4h
fadd v1.4h, v1.4h, v16.4h
fadd v2.4h, v2.4h, v16.4h
fadd v3.4h, v3.4h, v16.4h
cmp x7, #3
beq Relu6_4x4
cmp x7, #1
beq Relu_4x4
b Write_4x4
Relu6_4x4:                        // falls through into Relu_4x4 (clamp both ends)
fmin v0.4h, v0.4h, v26.4h
fmin v1.4h, v1.4h, v26.4h
fmin v2.4h, v2.4h, v26.4h
fmin v3.4h, v3.4h, v26.4h
Relu_4x4:
fmax v0.4h, v0.4h, v27.4h
fmax v1.4h, v1.4h, v27.4h
fmax v2.4h, v2.4h, v27.4h
fmax v3.4h, v3.4h, v27.4h
Write_4x4:
st1 {v0.4h}, [x15], x12
st1 {v1.4h}, [x15], x12
st1 {v2.4h}, [x15], x12
st1 {v3.4h}, [x15], x12

// Tail: one pixel at a time; the activation type is selected once and each
// variant runs its own drain loop until w13 hits 0.
Loop_1x4:
cmp x7, #3
beq Relu6_1x4
cmp x7, #1
beq Relu_1x4
b Write_1x4
Relu6_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.4h}, [x15], x12
b Relu6_1x4
Relu_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.4h}, [x15], x12
b Relu_1x4
Write_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.4h}, [x15], x12
b Write_1x4

// Finished one C4 block: skip the src padding between planes, next block.
HW_Add:
add x1, x1, x6
b Loop_C4

// Leftover channels (oc4mod in 1..3): same bias/activation work, but store
// only the valid fp16 lanes of each pixel.
Loop_C1:
cmp w4, #0
beq End
mov w13, w5
ld1 {v16.4h}, [x2], #8            // bias for the leftover channels
mov x15, #2
mul x14, x10, x15
add x0, x0, x14                   // advance dst to the leftover-channel column

cmp w4, #1
beq Loop_C1_1
cmp w4, #2
beq Loop_C1_2
cmp w4, #3
beq Loop_C1_3

// oc4mod == 1: store a single fp16 lane (lane 0) per pixel.
Loop_C1_1:
cmp x7, #3
beq Loop_C1_1_Relu6
cmp x7, #1
beq Loop_C1_1_Relu
b Loop_C1_1_Write
Loop_C1_1_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Relu6
Loop_C1_1_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Relu
Loop_C1_1_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Write

// oc4mod == 2: store the low 32 bits (two fp16 lanes) per pixel.
Loop_C1_2:
cmp x7, #3
beq Loop_C1_2_Relu6
cmp x7, #1
beq Loop_C1_2_Relu
b Loop_C1_2_Write
Loop_C1_2_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Relu6
Loop_C1_2_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Relu
Loop_C1_2_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Write

// oc4mod == 3: store the low 32 bits (lanes 0-1) via x0 plus lane 2 via x15,
// i.e. three fp16 values per pixel.
Loop_C1_3:
add x15, x0, #4                   // x15 = x0 + 2 channels = byte address of lane 2
cmp x7, #3
beq Loop_C1_3_Relu6
cmp x7, #1
beq Loop_C1_3_Relu
b Loop_C1_3_Write
Loop_C1_3_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Relu6
Loop_C1_3_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Relu
Loop_C1_3_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Write

End:
ret

+ 5
- 1
mindspore/lite/nnacl/fp16/common_func_fp16.c View File

@@ -50,5 +50,9 @@ void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const floa

void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
size_t plane_stride, ActType act_type) {
PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM);
size_t oc4mod = oc % C4NUM;
size_t oc4div = oc - oc4mod;
size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t);
PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type);
return;
}

+ 2
- 0
mindspore/lite/nnacl/fp16/common_func_fp16.h View File

@@ -32,6 +32,8 @@ void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_
/* deconv winograd */
void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t output_channel,
size_t plane_size, size_t plane_stride, ActType act_type);
void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
size_t plane_size, size_t plane_stride, size_t relu_type);

#ifdef __cplusplus
}


+ 1
- 1
mindspore/lite/nnacl/fp16/deconv_winograd_fp16.c View File

@@ -24,7 +24,7 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
float16_t *dst = dst_ptr;

for (int ic = 0; ic < ic4div; ic++) {
memcpy(dst, src, C4NUM * sizeof(float16_t));
vst1_f16(dst, vld1_f16(src));
dst += stride;
src += C4NUM;
}


+ 0
- 17
mindspore/lite/src/common/file_utils.cc View File

@@ -85,23 +85,6 @@ std::string RealPath(const char *path) {
return res;
}

int WriteToBin(const std::string &file_path, void *data, size_t size) {
std::ofstream out_file;

out_file.open(file_path.c_str(), std::ios::binary);
if (!out_file.good()) {
MS_LOG(ERROR) << "file is bad";
return -1;
}

if (!out_file.is_open()) {
MS_LOG(ERROR) << "file open failed";
return -1;
}
out_file.write(reinterpret_cast<char *>(data), size);
return 0;
}

int CompareOutputData(float *output_data, size_t output_size, float *correct_data, size_t data_size) {
if (output_size != data_size) {
printf("compare failed, output_size %zu isn't equal to data_size %zu.\n", output_size, data_size);


+ 9
- 1
mindspore/lite/src/common/file_utils.h View File

@@ -48,7 +48,15 @@ void WriteToTxt(const std::string &file_path, void *data, size_t element_size) {
out_file.close();
}

int WriteToBin(const std::string &file_path, void *data, size_t size);
inline int WriteToBin(const std::string &file_path, void *data, size_t size) {
std::ofstream out_file;
out_file.open(file_path.c_str(), std::ios::binary);
if (!out_file.good() || !out_file.is_open()) {
return -1;
}
out_file.write(reinterpret_cast<char *>(data), size);
return 0;
}

int CompareOutputData(float *output_data, size_t output_num, float *correct_data, size_t data_size);
int CompareOutput(float *output_data, size_t output_num, std::string file_path);


+ 1
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc View File

@@ -233,8 +233,7 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvWinogradFp16CPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
@@ -266,5 +265,4 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, CpuDeConvFp16KernelCreator)
} // namespace mindspore::kernel

+ 0
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc View File

@@ -248,7 +248,6 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvolutionWinogradCPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);


+ 39
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc View File

@@ -18,6 +18,7 @@
#include "src/runtime/runtime_api.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;

@@ -59,20 +60,10 @@ void DeConvolutionWinogradCPUKernel::FreeResizeBuf() {
wg.buf_init_ = false;
}

if (nc4hw4_output_ != nullptr) {
free(nc4hw4_output_);
nc4hw4_output_ = nullptr;
}

if (tile_input_ != nullptr) {
free(tile_input_);
tile_input_ = nullptr;
}

if (tile_output_ != nullptr) {
free(tile_output_);
tile_output_ = nullptr;
}
return;
}

@@ -108,9 +99,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() {
deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

nc4hw4_output_ =
reinterpret_cast<float *>(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float)));

deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT);
deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT);

@@ -129,9 +117,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() {

deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_;
deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_;
size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
tile_output_ = reinterpret_cast<float *>(malloc(size * sizeof(float)));

for (int i = 0; i < deconv_param_->compute_size_; i++) {
DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
@@ -329,7 +314,44 @@ int DeConvolutionWinogradCPUKernel::DeDeconvPost(int task_id) {
return RET_OK;
}

int DeConvolutionWinogradCPUKernel::InitRunBuf() {
int size = deconv_param_->oc_up4_ * deconv_param_->output_plane_;
nc4hw4_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float)));
if (nc4hw4_output_ == nullptr) {
MS_LOG(ERROR) << "de conv wg Malloc nc4hw4_output_ error!";
return RET_MEMORY_FAILED;
}

size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
tile_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float)));
if (tile_output_ == nullptr) {
MS_LOG(ERROR) << "de conv wg Malloc tile_output_ error!";
return RET_MEMORY_FAILED;
}
return RET_OK;
}

void DeConvolutionWinogradCPUKernel::FreeRunBuf() {
if (nc4hw4_output_ != nullptr) {
ctx_->allocator->Free(nc4hw4_output_);
nc4hw4_output_ = nullptr;
}

if (tile_output_ != nullptr) {
ctx_->allocator->Free(tile_output_);
tile_output_ = nullptr;
}
return;
}

int DeConvolutionWinogradCPUKernel::Run() {
auto ret = InitRunBuf();
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitRunBuf fail!ret: " << ret;
return ret;
}

float *src_in = reinterpret_cast<float *>(in_tensors_[0]->data_c());
float *src_out = reinterpret_cast<float *>(out_tensors_[0]->data_c());

@@ -344,6 +366,7 @@ int DeConvolutionWinogradCPUKernel::Run() {
ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_);
}

FreeRunBuf();
return RET_OK;
}



+ 2
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h View File

@@ -54,6 +54,8 @@ class DeConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int InitParameter();
void FreeDeconvParam();
void FreeResizeBuf();
int InitRunBuf();
void FreeRunBuf();

private:
DeConvParam *deconv_param_;


+ 1
- 0
mindspore/lite/test/models_arm32.cfg View File

@@ -11,6 +11,7 @@ ml_face_contour
mnet
ml_face_landmark
ml_liveness_detect_landmark
deconv_test_model
# aware_training
video_infer.tflite
mobilenet_v1_1.0_224_quant.tflite

+ 1
- 0
mindspore/lite/test/models_caffe.cfg View File

@@ -56,3 +56,4 @@ hiai_face_attr1
detect-mbv1-shortcut-400-400_nopostprocess_simplified
detect_mbv1_640_480_nopostprocess_simplified
retinaface
deconv_test_model

Loading…
Cancel
Save