Browse Source

[MSLITE] deconv winograd fp16 neon

tags/v1.1.0
ling 5 years ago
parent
commit
51fced3767
12 changed files with 340 additions and 40 deletions
  1. +279
    -0
      mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S
  2. +5
    -1
      mindspore/lite/nnacl/fp16/common_func_fp16.c
  3. +2
    -0
      mindspore/lite/nnacl/fp16/common_func_fp16.h
  4. +1
    -1
      mindspore/lite/nnacl/fp16/deconv_winograd_fp16.c
  5. +0
    -17
      mindspore/lite/src/common/file_utils.cc
  6. +9
    -1
      mindspore/lite/src/common/file_utils.h
  7. +1
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
  8. +0
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
  9. +39
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc
  10. +2
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h
  11. +1
    -0
      mindspore/lite/test/models_arm32.cfg
  12. +1
    -0
      mindspore/lite/test/models_caffe.cfg

+ 279
- 0
mindspore/lite/nnacl/assembly/fp16/PostFuncBiasReluC4Fp16.S View File

@@ -0,0 +1,279 @@

.text
.align 5
//.p2align 5,,15
.global PostFuncBiasReluC4Fp16
#ifndef __APPLE__
.type PostFuncBiasReluC4Fp16, %function
#endif

//void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
// size_t plane_size, size_t plane_stride, size_t relu_type);
//
// Repacks fp16 data from the C4-tiled layout in src into the NHWC layout in
// dst, adding the per-channel bias and applying an optional activation.
// relu_type: 1 = ReLU (floor at 0), 3 = ReLU6 (clamp to [0, 6]), other = none.
//
// x0 dst x1 src x2 bias
// w3 oc4div w4 oc4mod w5 plane_size
// x6 plane_stride x7 relu_type
//
// Register roles inside the routine:
// v26 = {6.0h x4} (ReLU6 cap) v27 = {0.0h x4} (ReLU floor)
// x12 = dst pixel stride in bytes = (oc4div + oc4mod) * sizeof(float16_t)
// w10 = channels already emitted w13 = remaining pixels in current block
// x15 = dst write cursor v16 = bias for the current channel group

PostFuncBiasReluC4Fp16:

// Materialize the activation bounds: v26 = 6.0 per lane, v27 = 0.0 per lane.
movi v26.4h, #6
scvtf v26.4h, v26.4h
dup v27.4h, wzr

// x12 = (oc4div + oc4mod) * 2: byte distance between consecutive pixels of
// the same channel group in the NHWC destination.
mov x10, #2
add x12, x3, x4
mul x12, x12, x10

mov w10, #0                       // w10 = output-channel offset, advances by 4

// Outer loop: one iteration per full block of 4 output channels.
Loop_C4:
cmp w10, w3
beq Loop_C1                       // all full C4 blocks done -> handle the oc4mod tail
mov x15, #2
mul x14, x10, x15                 // x14 = channel offset in bytes
add x15, x0, x14                  // x15 = dst column for this channel group
add w10, w10, #4
mov w13, w5                       // reset the per-block pixel counter
ld1 {v16.4h}, [x2], #8            // load bias for these 4 channels

// Main body: 8 pixels (8 x 4 fp16) per iteration while at least 8 remain.
Loop_8x4:
cmp w13, #8
blt Loop_4x4
sub w13, w13, #8
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32

// Add the per-channel bias to every pixel.
fadd v0.4h, v0.4h, v16.4h
fadd v1.4h, v1.4h, v16.4h
fadd v2.4h, v2.4h, v16.4h
fadd v3.4h, v3.4h, v16.4h
fadd v4.4h, v4.4h, v16.4h
fadd v5.4h, v5.4h, v16.4h
fadd v6.4h, v6.4h, v16.4h
fadd v7.4h, v7.4h, v16.4h

// Activation dispatch; Relu6 deliberately falls through into Relu so ReLU6
// gets both the upper clamp (fmin) and the lower clamp (fmax).
cmp x7, #3
beq Relu6_8x4
cmp x7, #1
beq Relu_8x4
b Write_8x4
Relu6_8x4:
fmin v0.4h, v0.4h, v26.4h
fmin v1.4h, v1.4h, v26.4h
fmin v2.4h, v2.4h, v26.4h
fmin v3.4h, v3.4h, v26.4h
fmin v4.4h, v4.4h, v26.4h
fmin v5.4h, v5.4h, v26.4h
fmin v6.4h, v6.4h, v26.4h
fmin v7.4h, v7.4h, v26.4h
Relu_8x4:
fmax v0.4h, v0.4h, v27.4h
fmax v1.4h, v1.4h, v27.4h
fmax v2.4h, v2.4h, v27.4h
fmax v3.4h, v3.4h, v27.4h
fmax v4.4h, v4.4h, v27.4h
fmax v5.4h, v5.4h, v27.4h
fmax v6.4h, v6.4h, v27.4h
fmax v7.4h, v7.4h, v27.4h
Write_8x4:
// Scatter the 8 pixels into NHWC: one 4-channel store per pixel, stride x12.
st1 {v0.4h}, [x15], x12
st1 {v1.4h}, [x15], x12
st1 {v2.4h}, [x15], x12
st1 {v3.4h}, [x15], x12
st1 {v4.4h}, [x15], x12
st1 {v5.4h}, [x15], x12
st1 {v6.4h}, [x15], x12
st1 {v7.4h}, [x15], x12
b Loop_8x4

// At most one group of 4 pixels can remain here (w13 < 8); handle it, then
// fall through to the 1-pixel tail.
Loop_4x4:
cmp w13, #4
blt Loop_1x4
sub w13, w13, #4
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
fadd v0.4h, v0.4h, v16.4h
fadd v1.4h, v1.4h, v16.4h
fadd v2.4h, v2.4h, v16.4h
fadd v3.4h, v3.4h, v16.4h
cmp x7, #3
beq Relu6_4x4
cmp x7, #1
beq Relu_4x4
b Write_4x4
Relu6_4x4:                        // falls through into Relu_4x4 (clamp both ends)
fmin v0.4h, v0.4h, v26.4h
fmin v1.4h, v1.4h, v26.4h
fmin v2.4h, v2.4h, v26.4h
fmin v3.4h, v3.4h, v26.4h
Relu_4x4:
fmax v0.4h, v0.4h, v27.4h
fmax v1.4h, v1.4h, v27.4h
fmax v2.4h, v2.4h, v27.4h
fmax v3.4h, v3.4h, v27.4h
Write_4x4:
st1 {v0.4h}, [x15], x12
st1 {v1.4h}, [x15], x12
st1 {v2.4h}, [x15], x12
st1 {v3.4h}, [x15], x12

// Tail: one pixel at a time; the activation type is selected once and each
// variant runs its own drain loop until w13 hits 0.
Loop_1x4:
cmp x7, #3
beq Relu6_1x4
cmp x7, #1
beq Relu_1x4
b Write_1x4
Relu6_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.4h}, [x15], x12
b Relu6_1x4
Relu_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.4h}, [x15], x12
b Relu_1x4
Write_1x4:
cmp w13, #0
beq HW_Add
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.4h}, [x15], x12
b Write_1x4

// Finished one C4 block: skip the src padding between planes, next block.
HW_Add:
add x1, x1, x6
b Loop_C4

// Leftover channels (oc4mod in 1..3): same bias/activation work, but store
// only the valid fp16 lanes of each pixel.
Loop_C1:
cmp w4, #0
beq End
mov w13, w5
ld1 {v16.4h}, [x2], #8            // bias for the leftover channels
mov x15, #2
mul x14, x10, x15
add x0, x0, x14                   // advance dst to the leftover-channel column

cmp w4, #1
beq Loop_C1_1
cmp w4, #2
beq Loop_C1_2
cmp w4, #3
beq Loop_C1_3

// oc4mod == 1: store a single fp16 lane (lane 0) per pixel.
Loop_C1_1:
cmp x7, #3
beq Loop_C1_1_Relu6
cmp x7, #1
beq Loop_C1_1_Relu
b Loop_C1_1_Write
Loop_C1_1_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Relu6
Loop_C1_1_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Relu
Loop_C1_1_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.h}[0], [x0], x12
b Loop_C1_1_Write

// oc4mod == 2: store the low 32 bits (two fp16 lanes) per pixel.
Loop_C1_2:
cmp x7, #3
beq Loop_C1_2_Relu6
cmp x7, #1
beq Loop_C1_2_Relu
b Loop_C1_2_Write
Loop_C1_2_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Relu6
Loop_C1_2_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Relu
Loop_C1_2_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.s}[0], [x0], x12
b Loop_C1_2_Write

// oc4mod == 3: store the low 32 bits (lanes 0-1) via x0 plus lane 2 via x15,
// i.e. three fp16 values per pixel.
Loop_C1_3:
add x15, x0, #4                   // x15 = x0 + 2 channels = byte address of lane 2
cmp x7, #3
beq Loop_C1_3_Relu6
cmp x7, #1
beq Loop_C1_3_Relu
b Loop_C1_3_Write
Loop_C1_3_Relu6:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmin v0.4h, v0.4h, v26.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Relu6
Loop_C1_3_Relu:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
fmax v0.4h, v0.4h, v27.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Relu
Loop_C1_3_Write:
cmp w13, #0
beq End
sub w13, w13, #1
ld1 {v0.4h}, [x1], #8
fadd v0.4h, v0.4h, v16.4h
st1 {v0.s}[0], [x0], x12
st1 {v0.h}[2], [x15], x12
b Loop_C1_3_Write

End:
ret

+ 5
- 1
mindspore/lite/nnacl/fp16/common_func_fp16.c View File

@@ -50,5 +50,9 @@ void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const floa

void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
size_t plane_stride, ActType act_type) {
PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM);
size_t oc4mod = oc % C4NUM;
size_t oc4div = oc - oc4mod;
size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t);
PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type);
return;
}

+ 2
- 0
mindspore/lite/nnacl/fp16/common_func_fp16.h View File

@@ -32,6 +32,8 @@ void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_
/* deconv winograd */
void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t output_channel,
size_t plane_size, size_t plane_stride, ActType act_type);
void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
size_t plane_size, size_t plane_stride, size_t relu_type);

#ifdef __cplusplus
}


+ 1
- 1
mindspore/lite/nnacl/fp16/deconv_winograd_fp16.c View File

@@ -24,7 +24,7 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
float16_t *dst = dst_ptr;

for (int ic = 0; ic < ic4div; ic++) {
memcpy(dst, src, C4NUM * sizeof(float16_t));
vst1_f16(dst, vld1_f16(src));
dst += stride;
src += C4NUM;
}


+ 0
- 17
mindspore/lite/src/common/file_utils.cc View File

@@ -85,23 +85,6 @@ std::string RealPath(const char *path) {
return res;
}

int WriteToBin(const std::string &file_path, void *data, size_t size) {
std::ofstream out_file;

out_file.open(file_path.c_str(), std::ios::binary);
if (!out_file.good()) {
MS_LOG(ERROR) << "file is bad";
return -1;
}

if (!out_file.is_open()) {
MS_LOG(ERROR) << "file open failed";
return -1;
}
out_file.write(reinterpret_cast<char *>(data), size);
return 0;
}

int CompareOutputData(float *output_data, size_t output_size, float *correct_data, size_t data_size) {
if (output_size != data_size) {
printf("compare failed, output_size %zu isn't equal to data_size %zu.\n", output_size, data_size);


+ 9
- 1
mindspore/lite/src/common/file_utils.h View File

@@ -48,7 +48,15 @@ void WriteToTxt(const std::string &file_path, void *data, size_t element_size) {
out_file.close();
}

int WriteToBin(const std::string &file_path, void *data, size_t size);
inline int WriteToBin(const std::string &file_path, void *data, size_t size) {
std::ofstream out_file;
out_file.open(file_path.c_str(), std::ios::binary);
if (!out_file.good() || !out_file.is_open()) {
return -1;
}
out_file.write(reinterpret_cast<char *>(data), size);
return 0;
}

int CompareOutputData(float *output_data, size_t output_num, float *correct_data, size_t data_size);
int CompareOutput(float *output_data, size_t output_num, std::string file_path);


+ 1
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc View File

@@ -233,8 +233,7 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvWinogradFp16CPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
}
@@ -266,5 +265,4 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, CpuDeConvFp16KernelCreator)
} // namespace mindspore::kernel

+ 0
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc View File

@@ -248,7 +248,6 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
/* DeConvolutionWinogradCPUKernel */
kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive);
} else {
kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);


+ 39
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc View File

@@ -18,6 +18,7 @@
#include "src/runtime/runtime_api.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;

@@ -59,20 +60,10 @@ void DeConvolutionWinogradCPUKernel::FreeResizeBuf() {
wg.buf_init_ = false;
}

if (nc4hw4_output_ != nullptr) {
free(nc4hw4_output_);
nc4hw4_output_ = nullptr;
}

if (tile_input_ != nullptr) {
free(tile_input_);
tile_input_ = nullptr;
}

if (tile_output_ != nullptr) {
free(tile_output_);
tile_output_ = nullptr;
}
return;
}

@@ -108,9 +99,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() {
deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

nc4hw4_output_ =
reinterpret_cast<float *>(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float)));

deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT);
deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT);

@@ -129,9 +117,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() {

deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_;
deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_;
size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
tile_output_ = reinterpret_cast<float *>(malloc(size * sizeof(float)));

for (int i = 0; i < deconv_param_->compute_size_; i++) {
DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
@@ -329,7 +314,44 @@ int DeConvolutionWinogradCPUKernel::DeDeconvPost(int task_id) {
return RET_OK;
}

int DeConvolutionWinogradCPUKernel::InitRunBuf() {
int size = deconv_param_->oc_up4_ * deconv_param_->output_plane_;
nc4hw4_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float)));
if (nc4hw4_output_ == nullptr) {
MS_LOG(ERROR) << "de conv wg Malloc nc4hw4_output_ error!";
return RET_MEMORY_FAILED;
}

size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
tile_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float)));
if (tile_output_ == nullptr) {
MS_LOG(ERROR) << "de conv wg Malloc tile_output_ error!";
return RET_MEMORY_FAILED;
}
return RET_OK;
}

void DeConvolutionWinogradCPUKernel::FreeRunBuf() {
if (nc4hw4_output_ != nullptr) {
ctx_->allocator->Free(nc4hw4_output_);
nc4hw4_output_ = nullptr;
}

if (tile_output_ != nullptr) {
ctx_->allocator->Free(tile_output_);
tile_output_ = nullptr;
}
return;
}

int DeConvolutionWinogradCPUKernel::Run() {
auto ret = InitRunBuf();
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitRunBuf fail!ret: " << ret;
return ret;
}

float *src_in = reinterpret_cast<float *>(in_tensors_[0]->data_c());
float *src_out = reinterpret_cast<float *>(out_tensors_[0]->data_c());

@@ -344,6 +366,7 @@ int DeConvolutionWinogradCPUKernel::Run() {
ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_);
}

FreeRunBuf();
return RET_OK;
}



+ 2
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h View File

@@ -54,6 +54,8 @@ class DeConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int InitParameter();
void FreeDeconvParam();
void FreeResizeBuf();
int InitRunBuf();
void FreeRunBuf();

private:
DeConvParam *deconv_param_;


+ 1
- 0
mindspore/lite/test/models_arm32.cfg View File

@@ -11,6 +11,7 @@ ml_face_contour
mnet
ml_face_landmark
ml_liveness_detect_landmark
deconv_test_model
# aware_training
video_infer.tflite
mobilenet_v1_1.0_224_quant.tflite

+ 1
- 0
mindspore/lite/test/models_caffe.cfg View File

@@ -56,3 +56,4 @@ hiai_face_attr1
detect-mbv1-shortcut-400-400_nopostprocess_simplified
detect_mbv1_640_480_nopostprocess_simplified
retinaface
deconv_test_model

Loading…
Cancel
Save