
!5203 [MS][LITE][CPU]extract post process func

Merge pull request !5203 from fuzhiye/tmp
tags/v1.0.0
mindspore-ci-bot committed 5 years ago
commit b2cff2842d
13 changed files with 128 additions and 59 deletions
  1. +7  -6   mindspore/lite/nnacl/fp32/conv.c
  2. +2  -2   mindspore/lite/nnacl/fp32/conv.h
  3. +1  -1   mindspore/lite/nnacl/op_base.h
  4. +2  -0   mindspore/lite/nnacl/pack.c
  5. +2  -0   mindspore/lite/nnacl/pack.h
  6. +26 -13  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
  7. +1  -0   mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
  8. +26 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  9. +1  -0   mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  10. +28 -14 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
  11. +1  -0  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
  12. +30 -13 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
  13. +1  -0  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
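
All 13 files follow the same refactor: the activation-dependent output unpacking that used to sit inline at the end of each kernel's Run() is pulled out into a PostProcess() member that switches on conv_param_->act_type_, and the ActType enum gains ActType_Prelu together with a PackNC4HW4ToNHWCPreluFp32 declaration that is left as an empty stub. The standalone C++ sketch below illustrates the dispatch pattern only; the enum values and error string mirror the diff, but the toy unpack helpers, buffer sizes, and main() are invented for illustration and are not MindSpore code (ActType_Prelu falls through to the default branch, just as in the new PostProcess() bodies).

// Standalone sketch of the PostProcess() dispatch introduced by this commit.
// ActType mirrors nnacl/op_base.h; the unpack stand-ins, sizes and main() are
// illustrative only and are not part of the MindSpore sources.
#include <algorithm>
#include <cstdio>
#include <vector>

enum ActType { ActType_No, ActType_Relu, ActType_Relu6, ActType_Prelu };

constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;

// Stand-ins for the PackNC4HW4ToNHWC{,Relu,Relu6}Fp32 / UnPack*Output helpers:
// copy a flat buffer and clamp it according to the activation.
static void UnpackNo(const float *src, float *dst, int n) { std::copy(src, src + n, dst); }
static void UnpackRelu(const float *src, float *dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = std::max(src[i], 0.0f);
}
static void UnpackRelu6(const float *src, float *dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = std::min(std::max(src[i], 0.0f), 6.0f);
}

// The extracted post-process step: pick the unpack routine from the activation
// type so the caller (Run() in the kernels) only has to check one return code.
int PostProcess(ActType act_type, const float *src, float *dst, int n) {
  switch (act_type) {
    case ActType_No:
      UnpackNo(src, dst, n);
      break;
    case ActType_Relu:
      UnpackRelu(src, dst, n);
      break;
    case ActType_Relu6:
      UnpackRelu6(src, dst, n);
      break;
    default:
      std::fprintf(stderr, "Unsupport activation type.\n");
      return RET_ERROR;
  }
  return RET_OK;
}

int main() {
  std::vector<float> tmp_out = {-1.0f, 3.5f, 8.0f};
  std::vector<float> out(tmp_out.size());
  if (PostProcess(ActType_Relu6, tmp_out.data(), out.data(), static_cast<int>(out.size())) != RET_OK) {
    return 1;
  }
  std::printf("%.1f %.1f %.1f\n", out[0], out[1], out[2]);  // prints: 0.0 3.5 6.0
  return 0;
}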

+7 -6  mindspore/lite/nnacl/fp32/conv.c

@@ -307,8 +307,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
}

// step 4 : output transform
- WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
- cal_num, out_tile_index, out_w_block, conv_param, output_trans_func);
+ WinogradOutputTransform(dst_ptr, tmp_out_data + tmp_out_batch_offset, bias_data, cal_num, out_tile_index,
+ out_w_block, conv_param, output_trans_func);
}
}
}
@@ -449,8 +449,8 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
}

// fp32 conv3x3
- void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
- TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
+ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
+ int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
int thread_count = conv_param->thread_num_;
int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
int output_channel = conv_param->output_channel_;
@@ -461,6 +461,7 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
int output_count = out_w_block * out_h_block;
int output_tile_count = UP_DIV(output_count, C12NUM);
const int input_unit_square = 4 * 4;

float *tile_buffer = buffer_list[0];
float *block_unit_buffer = buffer_list[1];
float *tmp_dst_buffer = buffer_list[2];
@@ -491,8 +492,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
ic4 * C4NUM, real_cal_num, oc8 * C8NUM, input_unit_square, 2);
}
- Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset,
- bias_data, start_index, real_cal_num, out_w_block, conv_param);
+ Conv3x3Fp32OutputTransform(dst_ptr, nc4hw4_out + nc4hw4_buffer_offset, bias_data, start_index, real_cal_num,
+ out_w_block, conv_param);
}
}
}

+2 -2  mindspore/lite/nnacl/fp32/conv.h

@@ -65,8 +65,8 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
int output_unit);

// fp32 conv3x3
- void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
- TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
+ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
+ int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
#ifdef __cplusplus
}
#endif


+1 -1  mindspore/lite/nnacl/op_base.h

@@ -61,6 +61,6 @@ typedef struct OpParameter {
int thread_num_;
} OpParameter;

- typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
+ typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6, ActType_Prelu } ActType;

#endif // MINDSPORE_LITE_NNACL_OP_BASE_H_

+2 -0  mindspore/lite/nnacl/pack.c

@@ -761,6 +761,8 @@ void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane,
}
}

+ void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel) {}

void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) {
int c4 = UP_DIV(channel, C4NUM);
for (int b = 0; b < batch; b++) {


+2 -0  mindspore/lite/nnacl/pack.h

@@ -81,6 +81,8 @@ void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane,

void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel);

+ void PackNC4HW4ToNHWCPreluFp32(const void *src, void *dst, const void *slope, int batch, int plane, int channel);

void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);

void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel);
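
pack.c adds PackNC4HW4ToNHWCPreluFp32 only as an empty stub and pack.h declares it; the commit does not implement it. Purely as an assumption about where it is headed, a PReLU-aware unpack would walk the NC4HW4 blocks, write NHWC output, and scale negative values by a per-channel slope, roughly as in this standalone C++ sketch (the layout details, the Sketch-suffixed helper name, and the test values are all hypothetical, not part of the commit):

// Hypothetical body for the PackNC4HW4ToNHWCPreluFp32 stub added above (the
// commit leaves it empty). Assumes C4NUM == 4 and a per-channel slope, mirroring
// the existing NC4HW4 -> NHWC unpack helpers; written as a standalone sketch.
#include <cstdio>
#include <vector>

constexpr int C4NUM = 4;
static int UpDiv(int a, int b) { return (a + b - 1) / b; }

void PackNC4HW4ToNHWCPreluFp32Sketch(const float *src, float *dst, const float *slope,
                                     int batch, int plane, int channel) {
  int c4 = UpDiv(channel, C4NUM);
  for (int b = 0; b < batch; ++b) {
    const float *src_b = src + b * plane * c4 * C4NUM;
    float *dst_b = dst + b * plane * channel;
    for (int p = 0; p < plane; ++p) {
      for (int c = 0; c < channel; ++c) {
        // NC4HW4 layout: channels grouped in blocks of 4, each block storing
        // `plane` groups of 4 values.
        float v = src_b[(c / C4NUM) * plane * C4NUM + p * C4NUM + (c % C4NUM)];
        dst_b[p * channel + c] = v >= 0.0f ? v : v * slope[c];
      }
    }
  }
}

int main() {
  // 1 batch, 2 spatial positions, 3 channels padded to one block of 4.
  std::vector<float> src = {1.0f, -2.0f, 3.0f, 0.0f, -4.0f, 5.0f, -6.0f, 0.0f};
  std::vector<float> slope = {0.1f, 0.2f, 0.3f};
  std::vector<float> dst(1 * 2 * 3);
  PackNC4HW4ToNHWCPreluFp32Sketch(src.data(), dst.data(), slope.data(), 1, 2, 3);
  for (float v : dst) std::printf("%.2f ", v);  // 1.00 -0.40 3.00 -0.40 5.00 -1.80
  std::printf("\n");
  return 0;
}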


+26 -13  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc

@@ -207,6 +207,28 @@ static int Convolution3x3Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void
return RET_OK;
}

+ int Convolution3x3FP16CPUKernel::PostProcess() {
+ auto act_type = conv_param_->act_type_;
+ switch (act_type) {
+ case ActType_No:
+ UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ case ActType_Relu:
+ UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ case ActType_Relu6:
+ UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ default:
+ MS_LOG(ERROR) << "Unsupport activation type.";
+ return RET_ERROR;
+ }
+ return RET_OK;
+ }

int Convolution3x3FP16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
@@ -236,20 +258,11 @@ int Convolution3x3FP16CPUKernel::Run() {
return RET_ERROR;
}

- // get real output
- bool relu = conv_param_->act_type_ == ActType_Relu;
- bool relu6 = conv_param_->act_type_ == ActType_Relu6;
- if (relu) {
- UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_);
- } else if (relu6) {
- UnPack3x3Relu6OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_);
- } else {
- UnPack3x3OutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_);
+ ret = PostProcess();
+ if (ret != RET_OK) {
+ MS_LOG(ERROR) << "Post process failed.";
+ return ret;
}

ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
FreeTmpBuffer();


+1 -0  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h

@@ -52,6 +52,7 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int InitWeightBias();
int InitTmpBuffer();
void ConfigInputOutput();
+ int PostProcess();

private:
void FreeTmpBuffer() {


+26 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -358,6 +358,28 @@ static int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv,
return RET_OK;
}

+ int ConvolutionWinogradFP16CPUKernel::PostProcess() {
+ auto act_type = conv_param_->act_type_;
+ switch (act_type) {
+ case ActType_No:
+ UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ case ActType_Relu:
+ UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ case ActType_Relu6:
+ UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ default:
+ MS_LOG(ERROR) << "Unsupport activation type.";
+ return RET_ERROR;
+ }
+ return RET_OK;
+ }

int ConvolutionWinogradFP16CPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
@@ -390,16 +412,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
return RET_ERROR;
}

- // get real output
- if (conv_param_->act_type_ == ActType_Relu) {
- UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
- } else if (conv_param_->act_type_ == ActType_Relu6) {
- UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
- } else {
- UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ ret = PostProcess();
+ if (ret != RET_OK) {
+ MS_LOG(ERROR) << "Post process failed.";
+ return ret;
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();


+1 -0  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h

@@ -56,6 +56,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int MallocFilterMatrix(int oc_block, int oc_block_num);
int InitTmpBuffer();
int ConfigInputOutput();
+ int PostProcess();

private:
void FreeTmpBuffer() {


+28 -14  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc

@@ -207,9 +207,8 @@ int Convolution3x3CPUKernel::RunImpl(int task_id) {
MS_LOG(ERROR) << "gemm_func is nullptr.";
return RET_ERROR;
}
- auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
- output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
+ tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
return RET_OK;
}

@@ -223,6 +222,29 @@ int Convolution3x3Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
return RET_OK;
}

+ int Convolution3x3CPUKernel::PostProcess() {
+ auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
+ auto act_type = conv_param_->act_type_;
+ switch (act_type) {
+ case ActType_No:
+ PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+ conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ case ActType_Relu:
+ PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+ conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ case ActType_Relu6:
+ PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+ conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+ break;
+ default:
+ MS_LOG(ERROR) << "Unsupport activation type.";
+ return RET_ERROR;
+ }
+ return RET_OK;
+ }

int Convolution3x3CPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
@@ -247,18 +269,10 @@ int Convolution3x3CPUKernel::Run() {
return RET_ERROR;
}

- auto is_relu = conv_param_->act_type_ == ActType_Relu;
- auto is_relu6 = conv_param_->act_type_ == ActType_Relu6;
- auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
- if (is_relu) {
- PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
- conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
- } else if (is_relu6) {
- PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
- conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
- } else {
- PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
- conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+ ret = PostProcess();
+ if (ret != RET_OK) {
+ MS_LOG(ERROR) << "Post process failed.";
+ return ret;
}
FreeTmpBuffer();
return RET_OK;


+1 -0  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h

@@ -45,6 +45,7 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
int InitWeightBias();
int InitTmpBuffer();
void ConfigInputOutput();
+ int PostProcess();

private:
void FreeTmpBuffer() {


+30 -13  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc

@@ -353,18 +353,43 @@ int ConvolutionWinogradImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata
return RET_OK;
}

+ int ConvolutionWinogradCPUKernel::PostProcess() {
+ auto out_tensor = out_tensors_.front();
+ auto out_data = reinterpret_cast<float *>(out_tensor->Data());
+ auto act_type = conv_param_->act_type_;
+ switch (act_type) {
+ case ActType_No:
+ UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ case ActType_Relu:
+ UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ case ActType_Relu6:
+ UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+ conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ break;
+ default:
+ MS_LOG(ERROR) << "Unsupport activation type.";
+ return RET_ERROR;
+ }
+ return RET_OK;
+ }

int ConvolutionWinogradCPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
return prepare_ret;
}
// malloc tmp buffer
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";
return RET_ERROR;
}

auto input_tensor = in_tensors_.at(kInputIndex);
auto ori_input_data = input_tensor->Data();
PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
@@ -377,18 +402,10 @@ int ConvolutionWinogradCPUKernel::Run() {
return RET_ERROR;
}

- // get real output
- auto out_tensor = out_tensors_.front();
- auto out_data = reinterpret_cast<float *>(out_tensor->Data());
- if (conv_param_->act_type_ == ActType_Relu) {
- UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
- } else if (conv_param_->act_type_ == ActType_Relu6) {
- UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
- } else {
- UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
- conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
+ ret = PostProcess();
+ if (ret != RET_OK) {
+ MS_LOG(ERROR) << "Post process failed.";
+ return ret;
}
FreeTmpBuffer();
return RET_OK;


+1 -0  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h

@@ -51,6 +51,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int MallocFilterMatrix(int oc_block, int oc_block_num);
int InitTmpBuffer();
int ConfigInputOutput();
+ int PostProcess();

private:
void FreeTmpBuffer() {

