From 4333de375e4478754d4db69bf808246a22411a30 Mon Sep 17 00:00:00 2001 From: fuzhiye Date: Sun, 16 Aug 2020 14:24:32 +0800 Subject: [PATCH] rewrite conv3x3 conv winograd post func --- .../arm/fp16/convolution_winograd_fp16.cc | 13 +-- .../kernel/arm/fp32/convolution_3x3.cc | 13 +-- .../kernel/arm/fp32/convolution_winograd.cc | 14 +-- .../runtime/kernel/arm/nnacl/fp16/conv_fp16.c | 103 +++++++++++++++- .../runtime/kernel/arm/nnacl/fp16/conv_fp16.h | 6 + .../src/runtime/kernel/arm/nnacl/fp32/conv.c | 110 ++++++++++++++++-- .../src/runtime/kernel/arm/nnacl/fp32/conv.h | 6 + .../lite/src/runtime/kernel/arm/nnacl/pack.c | 78 +++++++++++++ .../lite/src/runtime/kernel/arm/nnacl/pack.h | 4 + 9 files changed, 315 insertions(+), 32 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index a3866c14e3..380c809a36 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -400,16 +400,15 @@ int ConvolutionWinogradFP16CPUKernel::Run() { } // get real output - UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_, output_unit_); - int output_num = - conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; if (conv_param_->is_relu_) { - ReluFp16(execute_output_, execute_output_, output_num); + UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } else if (conv_param_->is_relu6_) { - Relu6Fp16(execute_output_, execute_output_, output_num); + UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } else { - // do nothing + UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } ConvolutionBaseFP16CPUKernel::IfCastOutput(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc index bcf3665763..4797189ef7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc @@ -263,16 +263,15 @@ int Convolution3x3CPUKernel::Run() { auto is_relu = conv_param_->is_relu_; auto is_relu6 = conv_param_->is_relu6_; auto output_addr = reinterpret_cast(out_tensors_.at(kOutputIndex)->Data()); - PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, - conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); - int output_num = - conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; if (is_relu) { - ReluFp32(output_addr, output_addr, output_num); + PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } else if (is_relu6) { - Relu6Fp32(output_addr, output_addr, output_num); + PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } else { - // do nothing + PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_, + conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index d803e99f23..2a73b04683 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -368,18 +368,16 @@ int ConvolutionWinogradCPUKernel::Run() { // get real output auto out_tensor = out_tensors_.front(); auto out_data = reinterpret_cast(out_tensor->Data()); - UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, - conv_param_->output_w_, conv_param_->output_channel_, output_unit_); - int output_num = - conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_; if (conv_param_->is_relu_) { - ReluFp32(out_data, out_data, output_num); + UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } else if (conv_param_->is_relu6_) { - Relu6Fp32(out_data, out_data, output_num); + UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } else { - // do nothing + UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_, + conv_param_->output_w_, conv_param_->output_channel_, output_unit_); } - return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c index cf034be09a..78880c3ac2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c @@ -470,8 +470,9 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i int out_h_block_num = UP_DIV(height, output_unit); int out_w_block_num = UP_DIV(width, output_unit); int c8 = UP_DIV(channel, C8NUM); + int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; for (int b = 0; b < batch; b++) { - int src_batch_offset = b * c8 * C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + int src_batch_offset = b * c8 * c8_block; int dst_batch_offset = b * height * width * channel; for (int h = 0; h < height; h++) { int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); @@ -480,7 +481,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i int src_w_offset = src_h_offset + w * C8NUM; int dst_w_offset = dst_h_offset + w * channel; for (int c = 0; c < c8 - 1; c++) { - int src_c8_offset = src_w_offset + c * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int src_c8_offset = src_w_offset + c * c8_block; int dst_c8_offset = dst_w_offset + c * C8NUM; #ifdef ENABLE_NEON vst1q_f16(dst + dst_c8_offset, vld1q_f16(src + src_c8_offset)); @@ -491,7 +492,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i #endif } int c_res = channel - (c8 - 1) * C8NUM; - int src_c_res_offset = (c8 - 1) * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int src_c_res_offset = (c8 - 1) * c8_block; int dst_c_res_offset = (c8 - 1) * C8NUM; for (int c = 0; c < c_res; c++) { int src_c8_res_offset = src_w_offset + src_c_res_offset + c; @@ -502,3 +503,99 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i } } } + +void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, + int output_unit) { + int out_h_block_num = UP_DIV(height, output_unit); + int out_w_block_num = UP_DIV(width, output_unit); + int c8 = UP_DIV(channel, C8NUM); + int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + for (int b = 0; b < batch; b++) { + int src_batch_offset = b * c8 * c8_block; + int dst_batch_offset = b * height * width * channel; + for (int h = 0; h < height; h++) { + int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); + int dst_h_offset = dst_batch_offset + h * width * channel; + for (int w = 0; w < width; w++) { + int src_w_offset = src_h_offset + w * C8NUM; + int dst_w_offset = dst_h_offset + w * channel; + for (int c = 0; c < c8 - 1; c++) { + int src_c8_offset = src_w_offset + c * c8_block; + int dst_c8_offset = dst_w_offset + c * C8NUM; +#ifdef ENABLE_NEON + float16x8_t input_ptr = vld1q_f16(src + src_c8_offset); + float16x8_t zero = vdupq_n_f16(0); + input_ptr = vmaxq_f16(zero, input_ptr); + vst1q_f16(dst + dst_c8_offset, input_ptr); +#else + for (int i = 0; i < C8NUM; ++i) { + float16_t input_data = src[src_c8_offset + i]; + input_data = input_data < 0 ? 0 : input_data; + dst[dst_c8_offset + i] = input_data; + } +#endif + } + int c_res = channel - (c8 - 1) * C8NUM; + int src_c_res_offset = (c8 - 1) * c8_block; + int dst_c_res_offset = (c8 - 1) * C8NUM; + for (int c = 0; c < c_res; c++) { + int src_c8_res_offset = src_w_offset + src_c_res_offset + c; + int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c; + float16_t input_data = src[src_c8_res_offset]; + input_data = input_data < 0 ? 0 : input_data; + dst[dst_c8_res_offset] = input_data; + } + } + } + } +} + +void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, + int output_unit) { + int out_h_block_num = UP_DIV(height, output_unit); + int out_w_block_num = UP_DIV(width, output_unit); + int c8 = UP_DIV(channel, C8NUM); + int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + for (int b = 0; b < batch; b++) { + int src_batch_offset = b * c8 * c8_block; + int dst_batch_offset = b * height * width * channel; + for (int h = 0; h < height; h++) { + int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit); + int dst_h_offset = dst_batch_offset + h * width * channel; + for (int w = 0; w < width; w++) { + int src_w_offset = src_h_offset + w * C8NUM; + int dst_w_offset = dst_h_offset + w * channel; + for (int c = 0; c < c8 - 1; c++) { + int src_c8_offset = src_w_offset + c * c8_block; + int dst_c8_offset = dst_w_offset + c * C8NUM; +#ifdef ENABLE_NEON + float16x8_t input_ptr = vld1q_f16(src + src_c8_offset); + float16x8_t zero = vdupq_n_f16(0); + float16x8_t six = vdupq_n_f16(6); + input_ptr = vmaxq_f16(zero, input_ptr); + input_ptr = vminq_f16(six, input_ptr); + vst1q_f16(dst + dst_c8_offset, input_ptr); +#else + for (int i = 0; i < C8NUM; ++i) { + float16_t input_data = src[src_c8_offset + i]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + dst[dst_c8_offset + i] = input_data; + } +#endif + } + int c_res = channel - (c8 - 1) * C8NUM; + int src_c_res_offset = (c8 - 1) * c8_block; + int dst_c_res_offset = (c8 - 1) * C8NUM; + for (int c = 0; c < c_res; c++) { + int src_c8_res_offset = src_w_offset + src_c_res_offset + c; + int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c; + float16_t input_data = src[src_c8_res_offset]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + dst[dst_c8_res_offset] = input_data; + } + } + } + } +} diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h index 918e82c020..a22a6459e1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h @@ -67,6 +67,12 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, int output_unit); + +void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, + int output_unit); + +void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel, + int output_unit); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c index 0215e87607..91a9317028 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c @@ -296,8 +296,9 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i int out_h_block_num = UP_DIV(height, output_unit); int out_w_block_num = UP_DIV(width, output_unit); int c4 = UP_DIV(channel, C4NUM); + int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; for (int b = 0; b < batch; b++) { - int src_batch_offset = b * c4 * C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + int src_batch_offset = b * c4 * c4_block; int dst_batch_offset = b * height * width * channel; for (int h = 0; h < height; h++) { int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); @@ -306,19 +307,18 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i int src_w_offset = src_h_offset + w * C4NUM; int dst_w_offset = dst_h_offset + w * channel; for (int c = 0; c < c4 - 1; c++) { - int src_c4_offset = src_w_offset + c * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int src_c4_offset = src_w_offset + c * c4_block; int dst_c4_offset = dst_w_offset + c * C4NUM; #ifdef ENABLE_NEON vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset)); #else - dst[dst_c4_offset] = src[src_c4_offset]; - dst[dst_c4_offset + 1] = src[src_c4_offset + 1]; - dst[dst_c4_offset + 2] = src[src_c4_offset + 2]; - dst[dst_c4_offset + 3] = src[src_c4_offset + 3]; + for (int i = 0; i < C4NUM; ++i) { + dst[dst_c4_offset + i] = src[src_c4_offset + i]; + } #endif } int c_res = channel - (c4 - 1) * C4NUM; - int src_c_res_offset = (c4 - 1) * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit; + int src_c_res_offset = (c4 - 1) * c4_block; int dst_c_res_offset = (c4 - 1) * C4NUM; for (int c = 0; c < c_res; c++) { int src_c4_res_offset = src_w_offset + src_c_res_offset + c; @@ -330,6 +330,102 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i } } +void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel, + int output_unit) { + int out_h_block_num = UP_DIV(height, output_unit); + int out_w_block_num = UP_DIV(width, output_unit); + int c4 = UP_DIV(channel, C4NUM); + int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + for (int b = 0; b < batch; b++) { + int src_batch_offset = b * c4 * c4_block; + int dst_batch_offset = b * height * width * channel; + for (int h = 0; h < height; h++) { + int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); + int dst_h_offset = dst_batch_offset + h * width * channel; + for (int w = 0; w < width; w++) { + int src_w_offset = src_h_offset + w * C4NUM; + int dst_w_offset = dst_h_offset + w * channel; + for (int c = 0; c < c4 - 1; c++) { + int src_c4_offset = src_w_offset + c * c4_block; + int dst_c4_offset = dst_w_offset + c * C4NUM; +#ifdef ENABLE_NEON + float32x4_t input_ptr = vld1q_f32(src + src_c4_offset); + float32x4_t zero = vdupq_n_f32(0); + input_ptr = vmaxq_f32(zero, input_ptr); + vst1q_f32(dst + dst_c4_offset, input_ptr); +#else + for (int i = 0; i < C4NUM; ++i) { + float input_data = src[src_c4_offset + i]; + input_data = input_data < 0 ? 0 : input_data; + dst[dst_c4_offset + i] = input_data; + } +#endif + } + int c_res = channel - (c4 - 1) * C4NUM; + int src_c_res_offset = (c4 - 1) * c4_block; + int dst_c_res_offset = (c4 - 1) * C4NUM; + for (int c = 0; c < c_res; c++) { + int src_c4_res_offset = src_w_offset + src_c_res_offset + c; + int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c; + float input_data = src[src_c4_res_offset]; + input_data = input_data < 0 ? 0 : input_data; + dst[dst_c4_res_offset] = input_data; + } + } + } + } +} + +void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel, + int output_unit) { + int out_h_block_num = UP_DIV(height, output_unit); + int out_w_block_num = UP_DIV(width, output_unit); + int c4 = UP_DIV(channel, C4NUM); + int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit; + for (int b = 0; b < batch; b++) { + int src_batch_offset = b * c4 * c4_block; + int dst_batch_offset = b * height * width * channel; + for (int h = 0; h < height; h++) { + int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit); + int dst_h_offset = dst_batch_offset + h * width * channel; + for (int w = 0; w < width; w++) { + int src_w_offset = src_h_offset + w * C4NUM; + int dst_w_offset = dst_h_offset + w * channel; + for (int c = 0; c < c4 - 1; c++) { + int src_c4_offset = src_w_offset + c * c4_block; + int dst_c4_offset = dst_w_offset + c * C4NUM; +#ifdef ENABLE_NEON + float32x4_t input_ptr = vld1q_f32(src + src_c4_offset); + float32x4_t zero = vdupq_n_f32(0); + float32x4_t six = vdupq_n_f32(6); + input_ptr = vmaxq_f32(zero, input_ptr); + input_ptr = vminq_f32(six, input_ptr); + vst1q_f32(dst + dst_c4_offset, input_ptr); +#else + for (int i = 0; i < C4NUM; ++i) { + float input_data = src[src_c4_offset + i]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + dst[dst_c4_offset + i] = input_data; + } +#endif + } + int c_res = channel - (c4 - 1) * C4NUM; + int src_c_res_offset = (c4 - 1) * c4_block; + int dst_c_res_offset = (c4 - 1) * C4NUM; + for (int c = 0; c < c_res; c++) { + int src_c4_res_offset = src_w_offset + src_c_res_offset + c; + int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c; + float input_data = src[src_c4_res_offset]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + dst[dst_c4_res_offset] = input_data; + } + } + } + } +} + // fp32 conv3x3 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h index c27854cad8..d9c094636f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h @@ -63,6 +63,12 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit); +void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel, + int output_unit); + +void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel, + int output_unit); + // fp32 conv3x3 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c index 1d9c21edf1..d8649227a2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c @@ -582,6 +582,84 @@ void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int } } +void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel) { + int c4 = UP_DIV(channel, C4NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane * c4 * C4NUM; + int dst_offset = b * plane * channel; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_offset + k * C4NUM; + int dst_kernel_offset = dst_offset + k * channel; + for (int c = 0; c < c4 - 1; c++) { + int src_c_offset = src_kernel_offset + c * plane * C4NUM; + int dst_c_offset = dst_kernel_offset + c * C4NUM; +#ifdef ENABLE_NEON + float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset); + float32x4_t zero = vdupq_n_f32(0); + input_ptr = vmaxq_f32(zero, input_ptr); + vst1q_f32((float *)dst + dst_c_offset, input_ptr); +#else + for (int i = 0; i < C4NUM; ++i) { + float input_data = ((float *)src + src_c_offset)[i]; + input_data = input_data < 0 ? 0 : input_data; + ((float *)dst + dst_c_offset)[i] = input_data; + } +#endif + } + // res part + int res_c = channel - (c4 - 1) * C4NUM; + for (int i = 0; i < res_c; i++) { + int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i; + int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i; + float input_data = ((float *)src + src_res_c_offset)[0]; + input_data = input_data < 0 ? 0 : input_data; + ((float *)dst + dst_res_c_offset)[0] = input_data; + } + } + } +} + +void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel) { + int c4 = UP_DIV(channel, C4NUM); + for (int b = 0; b < batch; b++) { + int src_offset = b * plane * c4 * C4NUM; + int dst_offset = b * plane * channel; + for (int k = 0; k < plane; k++) { + int src_kernel_offset = src_offset + k * C4NUM; + int dst_kernel_offset = dst_offset + k * channel; + for (int c = 0; c < c4 - 1; c++) { + int src_c_offset = src_kernel_offset + c * plane * C4NUM; + int dst_c_offset = dst_kernel_offset + c * C4NUM; +#ifdef ENABLE_NEON + float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset); + float32x4_t zero = vdupq_n_f32(0); + float32x4_t six = vdupq_n_f32(6); + input_ptr = vmaxq_f32(zero, input_ptr); + input_ptr = vminq_f32(six, input_ptr); + vst1q_f32((float *)dst + dst_c_offset, input_ptr); +#else + for (int i = 0; i < C4NUM; ++i) { + float input_data = ((float *)src + src_c_offset)[i]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + ((float *)dst + dst_c_offset)[i] = input_data; + } +#endif + } + // res part + int res_c = channel - (c4 - 1) * C4NUM; + for (int i = 0; i < res_c; i++) { + int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i; + int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i; + float input_data = ((float *)src + src_res_c_offset)[0]; + input_data = input_data < 0 ? 0 : input_data; + input_data = input_data > 6 ? 6 : input_data; + ((float *)dst + dst_res_c_offset)[0] = input_data; + } + } + } +} + void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) { int c4 = UP_DIV(channel, C4NUM); for (int b = 0; b < batch; b++) { diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h index 0d8a285153..5e7f05647b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h +++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h @@ -70,6 +70,10 @@ void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); +void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel); + +void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel); + void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel);