From 4333de375e4478754d4db69bf808246a22411a30 Mon Sep 17 00:00:00 2001
From: fuzhiye <fuzhiye@huawei.com>
Date: Sun, 16 Aug 2020 14:24:32 +0800
Subject: [PATCH] rewrite conv3x3 conv winograd post func

---
 .../arm/fp16/convolution_winograd_fp16.cc     |  13 +--
 .../kernel/arm/fp32/convolution_3x3.cc        |  13 +--
 .../kernel/arm/fp32/convolution_winograd.cc   |  14 +--
 .../runtime/kernel/arm/nnacl/fp16/conv_fp16.c | 103 +++++++++++++++-
 .../runtime/kernel/arm/nnacl/fp16/conv_fp16.h |   6 +
 .../src/runtime/kernel/arm/nnacl/fp32/conv.c  | 110 ++++++++++++++++--
 .../src/runtime/kernel/arm/nnacl/fp32/conv.h  |   6 +
 .../lite/src/runtime/kernel/arm/nnacl/pack.c  |  78 +++++++++++++
 .../lite/src/runtime/kernel/arm/nnacl/pack.h  |   4 +
 9 files changed, 315 insertions(+), 32 deletions(-)
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index a3866c14e3..380c809a36 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -400,16 +400,15 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   }
 
   // get real output
-  UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
-                           conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  int output_num =
-    conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   if (conv_param_->is_relu_) {
-    ReluFp16(execute_output_, execute_output_, output_num);
+    UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                                 conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else if (conv_param_->is_relu6_) {
-    Relu6Fp16(execute_output_, execute_output_, output_num);
+    UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                                  conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else {
-    // do nothing
+    UnPackWinogradOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
+                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index bcf3665763..4797189ef7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -263,16 +263,15 @@ int Convolution3x3CPUKernel::Run() {
   auto is_relu = conv_param_->is_relu_;
   auto is_relu6 = conv_param_->is_relu6_;
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
-                       conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
-  int output_num =
-    conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   if (is_relu) {
-    ReluFp32(output_addr, output_addr, output_num);
+    PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                             conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   } else if (is_relu6) {
-    Relu6Fp32(output_addr, output_addr, output_num);
+    PackNC4HW4ToNHWCRelu6Fp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                              conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   } else {
-    // do nothing
+    PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
+                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index d803e99f23..2a73b04683 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -368,18 +368,16 @@ int ConvolutionWinogradCPUKernel::Run() {
   // get real output
   auto out_tensor = out_tensors_.front();
   auto out_data = reinterpret_cast<float *>(out_tensor->Data());
-  UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                       conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-  int output_num =
-    conv_param_->output_channel_ * conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   if (conv_param_->is_relu_) {
-    ReluFp32(out_data, out_data, output_num);
+    UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else if (conv_param_->is_relu6_) {
-    Relu6Fp32(out_data, out_data, output_num);
+    UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                              conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   } else {
-    // do nothing
+    UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
+                         conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   }
-
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
index cf034be09a..78880c3ac2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c
@@ -470,8 +470,9 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i
   int out_h_block_num = UP_DIV(height, output_unit);
   int out_w_block_num = UP_DIV(width, output_unit);
   int c8 = UP_DIV(channel, C8NUM);
+  int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
   for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c8 * C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+    int src_batch_offset = b * c8 * c8_block;
     int dst_batch_offset = b * height * width * channel;
     for (int h = 0; h < height; h++) {
       int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit);
@@ -480,7 +481,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i
         int src_w_offset = src_h_offset + w * C8NUM;
         int dst_w_offset = dst_h_offset + w * channel;
         for (int c = 0; c < c8 - 1; c++) {
-          int src_c8_offset = src_w_offset + c * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit;
+          int src_c8_offset = src_w_offset + c * c8_block;
           int dst_c8_offset = dst_w_offset + c * C8NUM;
 #ifdef ENABLE_NEON
           vst1q_f16(dst + dst_c8_offset, vld1q_f16(src + src_c8_offset));
@@ -491,7 +492,7 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i
 #endif
         }
         int c_res = channel - (c8 - 1) * C8NUM;
-        int src_c_res_offset = (c8 - 1) * C8NUM * out_w_block_num * out_h_block_num * output_unit * output_unit;
+        int src_c_res_offset = (c8 - 1) * c8_block;
         int dst_c_res_offset = (c8 - 1) * C8NUM;
         for (int c = 0; c < c_res; c++) {
           int src_c8_res_offset = src_w_offset + src_c_res_offset + c;
@@ -502,3 +503,99 @@ void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, i
     }
   }
 }
+
+void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
+                                  int output_unit) {
+  int out_h_block_num = UP_DIV(height, output_unit);
+  int out_w_block_num = UP_DIV(width, output_unit);
+  int c8 = UP_DIV(channel, C8NUM);
+  int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+  for (int b = 0; b < batch; b++) {
+    int src_batch_offset = b * c8 * c8_block;
+    int dst_batch_offset = b * height * width * channel;
+    for (int h = 0; h < height; h++) {
+      int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit);
+      int dst_h_offset = dst_batch_offset + h * width * channel;
+      for (int w = 0; w < width; w++) {
+        int src_w_offset = src_h_offset + w * C8NUM;
+        int dst_w_offset = dst_h_offset + w * channel;
+        for (int c = 0; c < c8 - 1; c++) {
+          int src_c8_offset = src_w_offset + c * c8_block;
+          int dst_c8_offset = dst_w_offset + c * C8NUM;
+#ifdef ENABLE_NEON
+          float16x8_t input_ptr = vld1q_f16(src + src_c8_offset);
+          float16x8_t zero = vdupq_n_f16(0);
+          input_ptr = vmaxq_f16(zero, input_ptr);
+          vst1q_f16(dst + dst_c8_offset, input_ptr);
+#else
+          for (int i = 0; i < C8NUM; ++i) {
+            float16_t input_data = src[src_c8_offset + i];
+            input_data = input_data < 0 ? 0 : input_data;
+            dst[dst_c8_offset + i] = input_data;
+          }
+#endif
+        }
+        int c_res = channel - (c8 - 1) * C8NUM;
+        int src_c_res_offset = (c8 - 1) * c8_block;
+        int dst_c_res_offset = (c8 - 1) * C8NUM;
+        for (int c = 0; c < c_res; c++) {
+          int src_c8_res_offset = src_w_offset + src_c_res_offset + c;
+          int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c;
+          float16_t input_data = src[src_c8_res_offset];
+          input_data = input_data < 0 ? 0 : input_data;
+          dst[dst_c8_res_offset] = input_data;
+        }
+      }
+    }
+  }
+}
+
+void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
+                                   int output_unit) {
+  int out_h_block_num = UP_DIV(height, output_unit);
+  int out_w_block_num = UP_DIV(width, output_unit);
+  int c8 = UP_DIV(channel, C8NUM);
+  int c8_block = C8NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+  for (int b = 0; b < batch; b++) {
+    int src_batch_offset = b * c8 * c8_block;
+    int dst_batch_offset = b * height * width * channel;
+    for (int h = 0; h < height; h++) {
+      int src_h_offset = src_batch_offset + C8NUM * (h * out_w_block_num * output_unit);
+      int dst_h_offset = dst_batch_offset + h * width * channel;
+      for (int w = 0; w < width; w++) {
+        int src_w_offset = src_h_offset + w * C8NUM;
+        int dst_w_offset = dst_h_offset + w * channel;
+        for (int c = 0; c < c8 - 1; c++) {
+          int src_c8_offset = src_w_offset + c * c8_block;
+          int dst_c8_offset = dst_w_offset + c * C8NUM;
+#ifdef ENABLE_NEON
+          float16x8_t input_ptr = vld1q_f16(src + src_c8_offset);
+          float16x8_t zero = vdupq_n_f16(0);
+          float16x8_t six = vdupq_n_f16(6);
+          input_ptr = vmaxq_f16(zero, input_ptr);
+          input_ptr = vminq_f16(six, input_ptr);
+          vst1q_f16(dst + dst_c8_offset, input_ptr);
+#else
+          for (int i = 0; i < C8NUM; ++i) {
+            float16_t input_data = src[src_c8_offset + i];
+            input_data = input_data < 0 ? 0 : input_data;
+            input_data = input_data > 6 ? 6 : input_data;
+            dst[dst_c8_offset + i] = input_data;
+          }
+#endif
+        }
+        int c_res = channel - (c8 - 1) * C8NUM;
+        int src_c_res_offset = (c8 - 1) * c8_block;
+        int dst_c_res_offset = (c8 - 1) * C8NUM;
+        for (int c = 0; c < c_res; c++) {
+          int src_c8_res_offset = src_w_offset + src_c_res_offset + c;
+          int dst_c8_res_offset = dst_w_offset + dst_c_res_offset + c;
+          float16_t input_data = src[src_c8_res_offset];
+          input_data = input_data < 0 ? 0 : input_data;
+          input_data = input_data > 6 ? 6 : input_data;
+          dst[dst_c8_res_offset] = input_data;
+        }
+      }
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h
index 918e82c020..a22a6459e1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.h
@@ -67,6 +67,12 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
 
 void UnPackWinogradOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
                               int output_unit);
+
+void UnPackWinogradReluOutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
+                                  int output_unit);
+
+void UnPackWinogradRelu6OutputFp16(const float16_t *src, float16_t *dst, int batch, int height, int width, int channel,
+                                   int output_unit);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
index 0215e87607..91a9317028 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
@@ -296,8 +296,9 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i
   int out_h_block_num = UP_DIV(height, output_unit);
   int out_w_block_num = UP_DIV(width, output_unit);
   int c4 = UP_DIV(channel, C4NUM);
+  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
   for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+    int src_batch_offset = b * c4 * c4_block;
     int dst_batch_offset = b * height * width * channel;
     for (int h = 0; h < height; h++) {
       int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
@@ -306,19 +307,18 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i
         int src_w_offset = src_h_offset + w * C4NUM;
         int dst_w_offset = dst_h_offset + w * channel;
         for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit;
+          int src_c4_offset = src_w_offset + c * c4_block;
           int dst_c4_offset = dst_w_offset + c * C4NUM;
 #ifdef ENABLE_NEON
           vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset));
 #else
-          dst[dst_c4_offset] = src[src_c4_offset];
-          dst[dst_c4_offset + 1] = src[src_c4_offset + 1];
-          dst[dst_c4_offset + 2] = src[src_c4_offset + 2];
-          dst[dst_c4_offset + 3] = src[src_c4_offset + 3];
+          for (int i = 0; i < C4NUM; ++i) {
+            dst[dst_c4_offset + i] = src[src_c4_offset + i];
+          }
 #endif
         }
         int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * C4NUM * out_w_block_num * out_h_block_num * output_unit * output_unit;
+        int src_c_res_offset = (c4 - 1) * c4_block;
         int dst_c_res_offset = (c4 - 1) * C4NUM;
         for (int c = 0; c < c_res; c++) {
           int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
@@ -330,6 +330,102 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i
   }
 }
 
+void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel,
+                              int output_unit) {
+  int out_h_block_num = UP_DIV(height, output_unit);
+  int out_w_block_num = UP_DIV(width, output_unit);
+  int c4 = UP_DIV(channel, C4NUM);
+  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+  for (int b = 0; b < batch; b++) {
+    int src_batch_offset = b * c4 * c4_block;
+    int dst_batch_offset = b * height * width * channel;
+    for (int h = 0; h < height; h++) {
+      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
+      int dst_h_offset = dst_batch_offset + h * width * channel;
+      for (int w = 0; w < width; w++) {
+        int src_w_offset = src_h_offset + w * C4NUM;
+        int dst_w_offset = dst_h_offset + w * channel;
+        for (int c = 0; c < c4 - 1; c++) {
+          int src_c4_offset = src_w_offset + c * c4_block;
+          int dst_c4_offset = dst_w_offset + c * C4NUM;
+#ifdef ENABLE_NEON
+          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
+          float32x4_t zero = vdupq_n_f32(0);
+          input_ptr = vmaxq_f32(zero, input_ptr);
+          vst1q_f32(dst + dst_c4_offset, input_ptr);
+#else
+          for (int i = 0; i < C4NUM; ++i) {
+            float input_data = src[src_c4_offset + i];
+            input_data = input_data < 0 ? 0 : input_data;
+            dst[dst_c4_offset + i] = input_data;
+          }
+#endif
+        }
+        int c_res = channel - (c4 - 1) * C4NUM;
+        int src_c_res_offset = (c4 - 1) * c4_block;
+        int dst_c_res_offset = (c4 - 1) * C4NUM;
+        for (int c = 0; c < c_res; c++) {
+          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
+          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
+          float input_data = src[src_c4_res_offset];
+          input_data = input_data < 0 ? 0 : input_data;
+          dst[dst_c4_res_offset] = input_data;
+        }
+      }
+    }
+  }
+}
+
+void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel,
+                               int output_unit) {
+  int out_h_block_num = UP_DIV(height, output_unit);
+  int out_w_block_num = UP_DIV(width, output_unit);
+  int c4 = UP_DIV(channel, C4NUM);
+  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
+  for (int b = 0; b < batch; b++) {
+    int src_batch_offset = b * c4 * c4_block;
+    int dst_batch_offset = b * height * width * channel;
+    for (int h = 0; h < height; h++) {
+      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
+      int dst_h_offset = dst_batch_offset + h * width * channel;
+      for (int w = 0; w < width; w++) {
+        int src_w_offset = src_h_offset + w * C4NUM;
+        int dst_w_offset = dst_h_offset + w * channel;
+        for (int c = 0; c < c4 - 1; c++) {
+          int src_c4_offset = src_w_offset + c * c4_block;
+          int dst_c4_offset = dst_w_offset + c * C4NUM;
+#ifdef ENABLE_NEON
+          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
+          float32x4_t zero = vdupq_n_f32(0);
+          float32x4_t six = vdupq_n_f32(6);
+          input_ptr = vmaxq_f32(zero, input_ptr);
+          input_ptr = vminq_f32(six, input_ptr);
+          vst1q_f32(dst + dst_c4_offset, input_ptr);
+#else
+          for (int i = 0; i < C4NUM; ++i) {
+            float input_data = src[src_c4_offset + i];
+            input_data = input_data < 0 ? 0 : input_data;
+            input_data = input_data > 6 ? 6 : input_data;
+            dst[dst_c4_offset + i] = input_data;
+          }
+#endif
+        }
+        int c_res = channel - (c4 - 1) * C4NUM;
+        int src_c_res_offset = (c4 - 1) * c4_block;
+        int dst_c_res_offset = (c4 - 1) * C4NUM;
+        for (int c = 0; c < c_res; c++) {
+          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
+          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
+          float input_data = src[src_c4_res_offset];
+          input_data = input_data < 0 ? 0 : input_data;
+          input_data = input_data > 6 ? 6 : input_data;
+          dst[dst_c4_res_offset] = input_data;
+        }
+      }
+    }
+  }
+}
+
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
                  TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h
index c27854cad8..d9c094636f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.h
@@ -63,6 +63,12 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
 
 void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit);
 
+void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel,
+                              int output_unit);
+
+void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel,
+                               int output_unit);
+
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
                  TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
index 1d9c21edf1..d8649227a2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
@@ -582,6 +582,84 @@ void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int
   }
 }
 
+void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_offset = b * plane * c4 * C4NUM;
+    int dst_offset = b * plane * channel;
+    for (int k = 0; k < plane; k++) {
+      int src_kernel_offset = src_offset + k * C4NUM;
+      int dst_kernel_offset = dst_offset + k * channel;
+      for (int c = 0; c < c4 - 1; c++) {
+        int src_c_offset = src_kernel_offset + c * plane * C4NUM;
+        int dst_c_offset = dst_kernel_offset + c * C4NUM;
+#ifdef ENABLE_NEON
+        float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset);
+        float32x4_t zero = vdupq_n_f32(0);
+        input_ptr = vmaxq_f32(zero, input_ptr);
+        vst1q_f32((float *)dst + dst_c_offset, input_ptr);
+#else
+        for (int i = 0; i < C4NUM; ++i) {
+          float input_data = ((float *)src + src_c_offset)[i];
+          input_data = input_data < 0 ? 0 : input_data;
+          ((float *)dst + dst_c_offset)[i] = input_data;
+        }
+#endif
+      }
+      // res part
+      int res_c = channel - (c4 - 1) * C4NUM;
+      for (int i = 0; i < res_c; i++) {
+        int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i;
+        int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i;
+        float input_data = ((float *)src + src_res_c_offset)[0];
+        input_data = input_data < 0 ? 0 : input_data;
+        ((float *)dst + dst_res_c_offset)[0] = input_data;
+      }
+    }
+  }
+}
+
+void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel) {
+  int c4 = UP_DIV(channel, C4NUM);
+  for (int b = 0; b < batch; b++) {
+    int src_offset = b * plane * c4 * C4NUM;
+    int dst_offset = b * plane * channel;
+    for (int k = 0; k < plane; k++) {
+      int src_kernel_offset = src_offset + k * C4NUM;
+      int dst_kernel_offset = dst_offset + k * channel;
+      for (int c = 0; c < c4 - 1; c++) {
+        int src_c_offset = src_kernel_offset + c * plane * C4NUM;
+        int dst_c_offset = dst_kernel_offset + c * C4NUM;
+#ifdef ENABLE_NEON
+        float32x4_t input_ptr = vld1q_f32((float *)src + src_c_offset);
+        float32x4_t zero = vdupq_n_f32(0);
+        float32x4_t six = vdupq_n_f32(6);
+        input_ptr = vmaxq_f32(zero, input_ptr);
+        input_ptr = vminq_f32(six, input_ptr);
+        vst1q_f32((float *)dst + dst_c_offset, input_ptr);
+#else
+        for (int i = 0; i < C4NUM; ++i) {
+          float input_data = ((float *)src + src_c_offset)[i];
+          input_data = input_data < 0 ? 0 : input_data;
+          input_data = input_data > 6 ? 6 : input_data;
+          ((float *)dst + dst_c_offset)[i] = input_data;
+        }
+#endif
+      }
+      // res part
+      int res_c = channel - (c4 - 1) * C4NUM;
+      for (int i = 0; i < res_c; i++) {
+        int src_res_c_offset = src_kernel_offset + (c4 - 1) * C4NUM * plane + i;
+        int dst_res_c_offset = dst_kernel_offset + (c4 - 1) * C4NUM + i;
+        float input_data = ((float *)src + src_res_c_offset)[0];
+        input_data = input_data < 0 ? 0 : input_data;
+        input_data = input_data > 6 ? 6 : input_data;
+        ((float *)dst + dst_res_c_offset)[0] = input_data;
+      }
+    }
+  }
+}
+
 void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
index 0d8a285153..5e7f05647b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
@@ -70,6 +70,10 @@ void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int
 
 void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
 
+void PackNC4HW4ToNHWCReluFp32(const void *src, void *dst, int batch, int plane, int channel);
+
+void PackNC4HW4ToNHWCRelu6Fp32(const void *src, void *dst, int batch, int plane, int channel);
+
 void PackNC4HW4ToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel);
 
 void PackNHWCToC8HWN8Fp32(const void *src, void *dst, int batch, int plane, int channel);