diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3BorderPixelInt8.S b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
similarity index 95%
rename from mindspore/lite/nnacl/assembly/arm32/ConvDw3x3BorderPixelInt8.S
rename to mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
index 1913b1a8e8..8189a0e7fe 100644
--- a/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3BorderPixelInt8.S
+++ b/mindspore/lite/nnacl/assembly/arm32/ConvDw3x3Int8BorderPixel.S
@@ -3,19 +3,19 @@
 
 .text
 .align 5
-.global ConvDw3x3BorderPixelInt8
+.global ConvDw3x3Int8BorderPixel
 #ifndef __APPLE__
-.type ConvDw3x3BorderPixelInt8, %function
+.type ConvDw3x3Int8BorderPixel, %function
 #endif
 
-// void ConvDw3x3BorderPixelInt8(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
+// void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
 //                               size_t width, size_t in_kh_step, size_t in_kw_step, size_t channel, size_t in_zp, size_t out_zp,
 //                               size_t out_multiplier, size_t left_shift, size_t right_shift, size_t acc_min, size_t acc_max) {
 
 // r0: dst, r1: src, r2: weight, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
 // r8: channel, r9: in_zp,  r10: out_zp, r11: out_multiplier, r12: left_shift, r13: right_shift
 // r14: acc_min, r15: acc_max
-ConvDw3x3BorderPixelInt8:
+ConvDw3x3Int8BorderPixel:
     // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr"
     // according to https://stackoverflow.com/questions/53625807
     // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway
diff --git a/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
new file mode 100644
index 0000000000..86a3cd29cd
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/arm32/DeconvDwInt8Post.S
@@ -0,0 +1,74 @@
+#ifdef __arm__
+#ifndef __aarch64__
+
+.text
+.align 5
+.global DeconvDwInt8Post
+#ifndef __APPLE__
+.type DeconvDwInt8Post, %function
+#endif
+
+// void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
+//                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
+//                       int32_t acc_max)
+// r0: dst, r1: output_buffer, r2: bias, r3: block_channel
+// passed on the stack, in order: pixel_nums, out_multiplier, left_shift, right_shift, out_zp, acc_min, acc_max
+
+DeconvDwInt8Post:
+    // Adds bias to each group of 4 int32 accumulators in output_buffer, requantizes them and stores the 4
+    // resulting int8 values at dst; dst then advances by block_channel bytes. Repeats pixel_nums times.
+    // r4-r8 are callee-saved under AAPCS32 (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf) and are pushed
+    // here. Only q0 and q8-q15 are used, so the callee-saved q4-q7 need no save. sp is not moved after the
+    // push (memory below sp may be overwritten asynchronously), so stack arguments start at [sp, #20].
+    // There are no subroutine calls, so lr is left in place and the function returns with "bx lr".
+    push {r4-r8}
+
+    vld1.32 {q9}, [r2]   // bias
+    ldr r4, [sp, #20]    // pixel_nums
+    cmp r4, #0
+    ble End              // nothing to do for pixel_nums <= 0
+    ldr r5, [sp, #24]
+    vdup.32 q14, r5   // out_multiplier
+    ldr r6, [sp, #28]
+    vdup.32 q13, r6   // left_shift
+    ldr r5, [sp, #32]
+    vdup.32 q12, r5   // right_shift
+    ldr r6, [sp, #36]
+    vdup.32 q15, r6   // output_zp
+    ldr r7, [sp, #40]
+    vdup.32 q11, r7   // acc_min
+    ldr r8, [sp, #44]
+    vdup.32 q10, r8   // acc_max
+
+    LoopCount:
+        mov r8, r0               // r8: byte cursor inside this pixel, r0 keeps the pixel base
+        vld1.32 {q0}, [r1]!
+        vadd.i32 q0, q0, q9      // accumulator + bias
+
+        vshl.s32 q0, q0, q13
+        vqrdmulh.s32 q0, q0, q14
+        vand q8, q0, q12         // q8 (caller-saved) holds the sign fix-up for the rounding shift below
+        vshr.s32 q8, q8, #31
+        vqadd.s32 q0, q0, q8
+        vrshl.s32 q0, q0, q12    // right shift when the count is negative (presumably right_shift <= 0 - confirm)
+        vadd.i32 q0, q0, q15
+        vmax.s32 q0, q0, q11
+        vmin.s32 q0, q0, q10
+
+        vqmovn.s32 d0, q0        // int32 -> int16
+        vqmovn.s16 d0, q0        // int16 -> int8, the low 4 bytes of d0 hold the results
+
+        vst1.8 {d0[0]}, [r8]!
+        vst1.8 {d0[1]}, [r8]!
+        vst1.8 {d0[2]}, [r8]!
+        vst1.8 {d0[3]}, [r8]!
+        add r0, r0, r3
+
+        subs r4, r4, #1
+        bgt LoopCount
+    End:
+        pop {r4-r8}
+        bx lr
+
+#endif
+#endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
new file mode 100644
index 0000000000..8f4c9a7208
--- /dev/null
+++ b/mindspore/lite/nnacl/assembly/arm64/DeconvDwInt8Post.S
@@ -0,0 +1,58 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global DeconvDwInt8Post
+#ifndef __APPLE__
+.type DeconvDwInt8Post, %function
+#endif
+
+// void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
+//                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
+//                       int32_t acc_max)
+// x0: dst, x1: output_buffer, x2: bias, w3: block_channel, w4: pixel_nums, w5: out_multiplier
+// w6: left_shift, w7: right_shift; out_zp, acc_min, acc_max are passed on the stack and loaded into w8, w9, w10
+
+DeconvDwInt8Post:
+    // Adds bias to each group of 4 int32 accumulators in output_buffer, requantizes them and stores the 4
+    // resulting int8 values at dst; dst then advances by block_channel bytes. Repeats pixel_nums times.
+    // Only caller-saved registers are used (x0-x10, v0, v16, v25-v31), so nothing has to be preserved, see
+    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
+    sxtw x3, w3       // int argument: the upper 32 bits of x3 are unspecified on entry
+    cmp w4, #0
+    ble End           // nothing to do for pixel_nums <= 0
+    ld1 {v25.4s}, [x2]   // bias
+    dup v26.4s, w6    // left_shift
+    dup v27.4s, w5    // out_multiplier
+    dup v28.4s, w7    // right_shift
+
+    ldr w8, [sp]
+    dup v29.4s, w8    // out_zp
+    ldr w9, [sp, #8]
+    dup v30.4s, w9   // acc_min
+    ldr w10, [sp, #16]
+    dup v31.4s, w10   // acc_max
+
+    LoopCount:
+        ld1 {v0.4s}, [x1], #16
+        add v0.4s, v0.4s, v25.4s
+        sqshl v0.4s, v0.4s, v26.4s
+        sqrdmulh v0.4s, v0.4s, v27.4s
+
+        and v16.16b, v28.16b, v0.16b
+        sshr v16.4s, v16.4s, #31
+        sqadd v0.4s, v0.4s, v16.4s
+        srshl v0.4s, v0.4s, v28.4s
+
+        add v0.4s, v0.4s, v29.4s
+        smax v0.4s, v0.4s, v30.4s
+        smin v0.4s, v0.4s, v31.4s
+
+        sqxtn v0.4h, v0.4s
+        sqxtn v0.8b, v0.8h
+        st1 {v0.s}[0], [x0], x3   // store 4 int8 values, then dst += block_channel
+        subs w4, w4, #1           // 32-bit counter: pixel_nums is an int
+        bgt LoopCount
+    End:
+        ret
+#endif
diff --git a/mindspore/lite/nnacl/int8/common_func_int8.h b/mindspore/lite/nnacl/int8/common_func_int8.h
index ae0dcf196a..c9a555cf75 100644
--- a/mindspore/lite/nnacl/int8/common_func_int8.h
+++ b/mindspore/lite/nnacl/int8/common_func_int8.h
@@ -47,6 +47,9 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con
                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp,
                       int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift,
                       int32_t *acc_min, int32_t *acc_max);
+void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
+                      int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
+                      int32_t acc_max);
 #endif
 
 #ifdef ENABLE_ARM32
@@ -54,6 +57,9 @@ void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *wei
                           size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min,
                           size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before,
                           int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset);
+void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
+                              int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp,
+                              int out_multiplier, int left_shift, int right_shift, int32_t acc_min, int32_t acc_max);
 #endif
 
 #ifdef ENABLE_ARM64
diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
index 6f25680bf6..4e46ee4d69 100644
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
@@ -302,6 +302,7 @@ void ConvDw3x3Int8(int8_t *output_data, int8_t *buffer, const int8_t *input_data
   }
 }
 
+#ifndef ENABLE_ARM32
 void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
                               int width, int in_kh_step, int in_kw_step, int channel, int8_t in_zp, int32_t out_zp,
                               int out_multiplier, int left_shift, int right_shift, int32_t acc_min, int32_t acc_max) {
@@ -338,6 +339,7 @@ void ConvDw3x3Int8BorderPixel(int8_t *dst, const int8_t *src, const int16_t *wei
     }
   }
 }
+#endif
 
 #ifndef ENABLE_ARM64
 void ConvDw3x3Int8Corner(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int in_kh_step,
@@ -730,12 +732,13 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *
 }
 #endif
 
-void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel,
-                                 const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift,
-                                 int32_t out_zp, int32_t acc_min, int32_t acc_max) {
+#ifndef ENABLE_ARM
+void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
+                      int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
+                      int32_t acc_max) {
   int8_t *dst_k = dst;
   int32_t *buffer_k = output_buffer;
-  for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
+  for (int k = 0; k < pixel_nums; k++) {
     for (int c = 0; c < C4NUM; c++) {
       buffer_k[c] += bias[c];
       buffer_k[c] = RoundingDivideByPOT(
@@ -749,6 +752,7 @@ void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int3
     buffer_k += C4NUM;
   }
 }
+#endif
 
 void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *input_data, const int16_t *weight_data,
                   const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
@@ -791,11 +795,11 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
                                   sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_);
 #endif
       }
-      DeconvDepthwisePostFuncInt8(
-        dst_data, output_buffer, bias, sliding->block_channel_, conv_param,
-        conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0],
-        conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
-        conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
+      DeconvDwInt8Post(dst_data, output_buffer, bias, sliding->block_channel_,
+                       conv_param->output_h_ * conv_param->output_w_, conv_param->conv_quant_arg_.quant_multiplier_[0],
+                       conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0],
+                       conv_param->conv_quant_arg_.output_quant_args_[0].zp_,
+                       conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]);
     }  // output C4 loop
     src += sliding->in_step_;
     dst += sliding->out_step_;