@@ -62,38 +62,59 @@ int FullconnectionFP16CPUKernel::ReSize() {
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
 
+  if (row == 1) is_vector_input_ = true;
+  int a_pack_row = 0;
+  int b_pack_col = 0;
+  if (is_vector_input_) {
+    a_pack_row = 1;
+    b_pack_col = fc_param_->col_;
+  } else {
+    a_pack_row = fc_param_->row_16_;
+    b_pack_col = fc_param_->col_8_;
+  }
   a_pack_ptr_ =
-    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
   if (a_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
+  memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
 
   b_pack_ptr_ =
-    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
+    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
   if (b_pack_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
-  memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
+  memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
 
   fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
   if (fc_param_->b_const_) {
     if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+                         fc_param_->col_ * fc_param_->deep_);
+      } else {
+        InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
     } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()),
+               fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
+      } else {
+        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
     }
+    b_ptr_ = b_pack_ptr_;
   }
 
   if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
+    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
     if (bias_ptr_ == nullptr) {
       FreeTmpBuffer();
       return RET_MEMORY_FAILED;
     }
-    memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
+    memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
     Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, fc_param_->col_);
   }
 
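The point of the new a_pack_row / b_pack_col indirection: when the output has a single row (a GEMV-shaped fully-connected layer), the old code still padded A up to 16 rows and B up to a multiple of 8 columns before packing. The standalone sketch below (my own approximation of the UP_DIV / UP_ROUND arithmetic, not the NNACL macros themselves, with hypothetical shapes) compares the buffer sizes the two paths allocate:

    #include <cstdio>

    constexpr int C8NUM = 8;
    constexpr int C16NUM = 16;
    constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }
    constexpr int UpRound(int x, int y) { return UpDiv(x, y) * y; }

    int main() {
      const int row = 1, col = 1000, deep = 1024;  // hypothetical vector-input FC
      const int row16 = UpRound(row, C16NUM);      // 16: rows the old A buffer padded to
      const int col8 = UpRound(col, C8NUM);        // 1008: cols the old B buffer padded to
      std::printf("packed A elems: %d -> %d\n", row16 * deep, 1 * deep);
      std::printf("packed B elems: %d -> %d\n", col8 * deep, col * deep);
      return 0;
    }

For row == 1 the padded A buffer is 16x larger than needed, and B no longer has to be repacked at all: an fp32 weight is only converted element-wise (Float32ToFloat16) and an fp16 weight is copied verbatim (memcpy), keeping its original row-major [col, deep] layout.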
@@ -102,7 +123,7 @@ int FullconnectionFP16CPUKernel::ReSize() {
       reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
   }
   return RET_OK;
 }
 
 void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
   RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
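InitMatrixA hands the input to RowMajor2Col16MajorFp16: A is packed into tiles of 16 rows stored column-by-column, so the blocked kernel can load 16 contiguous values per depth step (the trailing `true` asks the routine to also convert fp32 input to fp16). Below is a simplified float-only sketch of that packing scheme, written here for illustration rather than taken from NNACL:

    #include <vector>

    // Pack a row-major [row, deep] matrix into 16-row tiles; within a tile,
    // values are stored column-major so 16 rows of one column are contiguous.
    std::vector<float> PackRowMajorToCol16Major(const float *src, int row, int deep) {
      const int row16 = ((row + 15) / 16) * 16;  // round row count up to 16
      std::vector<float> dst(static_cast<size_t>(row16) * deep, 0.0f);  // zero padding
      for (int r = 0; r < row; ++r) {
        for (int d = 0; d < deep; ++d) {
          const int tile = r / 16;
          const int in_tile = r % 16;
          dst[(static_cast<size_t>(tile) * deep + d) * 16 + in_tile] = src[r * deep + d];
        }
      }
      return dst;
    }

This packing is exactly the work the vector fast path avoids: for a 1-row A, each tile would hold one real row plus 15 zero rows.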
@@ -133,11 +154,16 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
+  auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = output_ptr_ + task_id * thread_stride_;
-  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-             OutType_Nhwc);
+  if (is_vector_input_) {
+    MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
+  } else {
+    MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
+               OutType_Nhwc);
+  }
 
   return RET_OK;
 }
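This branch is the core of the change: for a 1xK input, the blocked MatMulFp16 wastes its 16-row tiling, so the vector path calls MatVecMulFp16 on unpacked data instead. A scalar reference for what the call site implies the kernel computes (inferred from the argument order: input vector, weight block, output, bias, activation, depth, column count; the real routine is a vectorized fp16 kernel, and the float types plus the bare ReLU flag here are simplifications of mine):

    // c[j] = act(dot(a, row j of b) + bias[j]) for col output columns.
    void MatVecMulRef(const float *a, const float *b, float *c, const float *bias,
                      bool relu, int deep, int col) {
      for (int j = 0; j < col; ++j) {
        float acc = (bias != nullptr) ? bias[j] : 0.0f;
        for (int d = 0; d < deep; ++d) {
          acc += a[d] * b[j * deep + d];  // B kept row-major as [col, deep]
        }
        c[j] = (relu && acc < 0.0f) ? 0.0f : acc;
      }
    }

The shared offset `task_id * thread_stride_ * fc_param_->deep_` works for both paths because in either layout a block of output columns occupies a contiguous run of (columns x deep_) fp16 values.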
@@ -163,16 +189,39 @@ int FullconnectionFP16CPUKernel::Run() {
   } else {
     output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
   }
 
   if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    if (is_vector_input_) {
+      Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_, fc_param_->deep_);
+    } else {
+      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    }
+    a_ptr_ = a_pack_ptr_;
   } else {
-    InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+    if (is_vector_input_) {
+      a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
+    } else {
+      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
+      a_ptr_ = a_pack_ptr_;
+    }
   }
 
   if (!fc_param_->b_const_) {
     if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_,
+                         fc_param_->col_ * fc_param_->deep_);
+      } else {
+        InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      }
+      b_ptr_ = b_pack_ptr_;
     } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+      if (is_vector_input_) {
+        b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
+      } else {
+        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
+        b_ptr_ = b_pack_ptr_;
+      }
     }
   }
   ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);
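After the per-run pointer selection above, every task launched through FcFP16Run processes a fixed slice of output columns. A small sketch of how thread_stride_ (set in ReSize) carves up the columns, with cur_oc mirroring the clamp RunImpl computes just before the lines quoted in its hunk (shapes and thread count are hypothetical):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int col = 1000, c8 = 8, threads = 4;  // hypothetical layer shape
      // thread_stride_ = UP_DIV(UP_DIV(col_, C8NUM), thread_count_) * C8NUM
      const int stride = (((col + c8 - 1) / c8 + threads - 1) / threads) * c8;
      for (int task_id = 0; task_id < threads; ++task_id) {
        const int cur_oc = std::min(stride, col - task_id * stride);
        if (cur_oc <= 0) continue;  // same early-out as RunImpl
        std::printf("task %d: columns [%d, %d)\n", task_id, task_id * stride,
                    task_id * stride + cur_oc);
      }
      return 0;
    }

Because the stride is rounded up to a multiple of C8NUM, only the last task can receive a short or empty column range, which is what the `cur_oc <= 0` early return in RunImpl guards against.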