diff --git a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
index a6622a729a..20cfa58a8c 100644
--- a/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm32/MatmulFp32Opt.S
@@ -381,6 +381,8 @@ LoopRow:
         ldr lr, [sp, #20]
         cmp lr, #0
         beq C8DstStep
+        cmp lr, #2 // lr still holds the word loaded from [sp, #20]; 0 was routed to C8DstStep just above
+        beq WinoDstStep // 2 takes the WinoDstStep path (presumably the Winograd output mode - confirm against the caller)
         mov lr, #4
         ldr r7, [sp, #12] // reload rhs col
         mul lr, lr, r7
@@ -391,6 +393,10 @@ LoopRow:
         ldr lr, [sp, #-40]
         add r2, lr, #128
         str r2, [sp, #-40]
+        b NoDstStep // C8 path is finished; jump over the WinoDstStep block that now sits before NoDstStep
+    WinoDstStep: // entered via `beq WinoDstStep` when the word at [sp, #20] equals 2
+        add r2, r2, r10 // advance r2 by r10 (r10 is set outside this hunk; presumably a dst stride in bytes - TODO confirm)
+        str r2, [sp, #-40] // write r2 back to the [sp, #-40] slot that the C8 path also reloads and updates
     NoDstStep:
         cmp r6, #4
         ble LoopRowEnd
diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add.S b/mindspore/lite/nnacl/assembly/arm64/bias_add.S
deleted file mode 100644
index 181de0de72..0000000000
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add.S
+++ /dev/null
@@ -1,82 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global BiasAdd
-#ifndef __APPLE__
-    .type BiasAdd, %function
-#endif
-
-
-
-//void BiasAdd(const float* bias, float* data, size_t oc4, size_t plan_size)
-
-//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size,
-
-BiasAdd:
-cmp x2, #0
-beq BiasAddEnd
-
-cmp x3, #0
-beq BiasAddEnd
-
-LoopOc4:
-ld1 {v0.4s}, [x0], #16
-mov x6, x3
-mov x5, x1
-
-Loop16LineIn:
-cmp x6, #4
-blt L4
-sub x6, x6, #4
-
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fadd v5.4s, v0.4s, v1.4s
-fadd v6.4s, v0.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-cmp x6, #4
-blt Loop16LineOut
-
-Loop16:
-st1 {v5.4s, v6.4s}, [x1], #32
-fadd v7.4s, v0.4s, v3.4s
-fadd v8.4s, v0.4s, v4.4s
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-st1 {v7.4s, v8.4s}, [x1], #32
-fadd v5.4s, v0.4s, v1.4s
-fadd v6.4s, v0.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-sub x6, x6, #4
-cmp x6, #4
-bge Loop16
-
-Loop16LineOut:
-st1 {v5.4s, v6.4s}, [x1], #32
-fadd v7.4s, v0.4s, v3.4s
-fadd v8.4s, v0.4s, v4.4s
-
-st1 {v7.4s, v8.4s}, [x1], #32
-
-L4:
-cmp x6, #0
-beq Loop16LineEnd
-Loop4:
-ld1 {v1.4s}, [x5], #16
-fadd v2.4s, v1.4s, v0.4s
-subs x6, x6, #1
-st1 {v2.4s}, [x1], #16
-bne Loop4
-
-Loop16LineEnd:
-subs x2, x2, #1
-bne LoopOc4
-
-BiasAddEnd:
-
-ret
-#endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S
deleted file mode 100644
index f9e4eccc69..0000000000
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu.S
+++ /dev/null
@@ -1,94 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global BiasAddRelu
-#ifndef __APPLE__
-    .type BiasAddRelu, %function
-#endif
-
-
-//void BiasAddRelu(const float* bias, float* data, size_t oc4, size_t plan_size)
-
-//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size,
-
-BiasAddRelu:
-cmp x2, #0
-beq BiasAddEnd
-
-cmp x3, #0
-beq BiasAddEnd
-
-dup v16.4s, wzr
-
-LoopOc4:
-ld1 {v0.4s}, [x0], #16
-mov x6, x3
-mov x5, x1
-
-Loop16LineIn:
-cmp x6, #4
-blt L4
-sub x6, x6, #4
-
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fadd v21.4s, v0.4s, v1.4s
-fadd v22.4s, v0.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-fmax v23.4s, v21.4s, v16.4s
-fmax v24.4s, v22.4s, v16.4s
-
-cmp x6, #4
-blt Loop16LineOut
-
-Loop16:
-st1 {v23.4s, v24.4s}, [x1], #32
-fadd v25.4s, v0.4s, v3.4s
-fadd v26.4s, v0.4s, v4.4s
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fmax v27.4s, v25.4s, v16.4s
-fmax v28.4s, v26.4s, v16.4s
-fadd v21.4s, v0.4s, v1.4s
-fadd v22.4s, v0.4s, v2.4s
-
-st1 {v27.4s, v28.4s}, [x1], #32
-ld1 {v3.4s, v4.4s}, [x5], #32
-fmax v23.4s, v21.4s, v16.4s
-fmax v24.4s, v22.4s, v16.4s
-sub x6, x6, #4
-cmp x6, #4
-bge Loop16
-
-Loop16LineOut:
-st1 {v23.4s, v24.4s}, [x1], #32
-fadd v25.4s, v0.4s, v3.4s
-fadd v26.4s, v0.4s, v4.4s
-
-fmax v27.4s, v25.4s, v16.4s
-fmax v28.4s, v26.4s, v16.4s
-st1 {v27.4s, v28.4s}, [x1], #32
-
-L4:
-cmp x6, #0
-beq Loop16LineEnd
-Loop4:
-ld1 {v1.4s}, [x5], #16
-fadd v1.4s, v1.4s, v0.4s
-fmax v1.4s, v1.4s, v16.4s
-
-subs x6, x6, #1
-st1 {v1.4s}, [x1], #16
-bne Loop4
-
-Loop16LineEnd:
-subs x2, x2, #1
-bne LoopOc4
-
-BiasAddEnd:
-
-ret
-#endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S b/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S
deleted file mode 100644
index 77c563a812..0000000000
--- a/mindspore/lite/nnacl/assembly/arm64/bias_add_relu6.S
+++ /dev/null
@@ -1,113 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global BiasAddRelu6
-#ifndef __APPLE__
-    .type BiasAddRelu6, %function
-#endif
-
-
-
-//void BiasAddRelu6(const float* bias, float* data, size_t oc4, size_t plan_size)
-
-//Auto: x0:bias, x1: data, x2:oc4,x3: plan_size,
-
-BiasAddRelu6:
-cmp x2, #0
-beq BiasAddEnd
-
-cmp x3, #0
-beq BiasAddEnd
-
-dup v16.4s, wzr
-movi v17.4s, #6
-scvtf v17.4s, v17.4s
-
-LoopOc4:
-ld1 {v0.4s}, [x0], #16
-mov x6, x3
-mov x5, x1
-
-Loop16LineIn:
-cmp x6, #4
-blt L4
-sub x6, x6, #4
-
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fadd v21.4s, v0.4s, v1.4s
-fadd v22.4s, v0.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-fmax v23.4s, v21.4s, v16.4s
-fmax v24.4s, v22.4s, v16.4s
-
-
-
-cmp x6, #4
-blt Loop16LineOut
-
-Loop16:
-fmin v23.4s, v23.4s, v17.4s
-fmin v24.4s, v24.4s, v17.4s
-fadd v25.4s, v0.4s, v3.4s
-fadd v26.4s, v0.4s, v4.4s
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-st1 {v23.4s, v24.4s}, [x1], #32
-fmax v27.4s, v25.4s, v16.4s
-fmax v28.4s, v26.4s, v16.4s
-fadd v21.4s, v0.4s, v1.4s
-fadd v22.4s, v0.4s, v2.4s
-
-fmin v27.4s, v27.4s, v17.4s
-fmin v28.4s, v28.4s, v17.4s
-fmax v23.4s, v21.4s, v16.4s
-fmax v24.4s, v22.4s, v16.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-st1 {v27.4s, v28.4s}, [x1], #32
-
-
-sub x6, x6, #4
-cmp x6, #4
-bge Loop16
-
-Loop16LineOut:
-fmin v23.4s, v23.4s, v17.4s
-fmin v24.4s, v24.4s, v17.4s
-fadd v25.4s, v0.4s, v3.4s
-fadd v26.4s, v0.4s, v4.4s
-
-fmax v27.4s, v25.4s, v16.4s
-fmax v28.4s, v26.4s, v16.4s
-st1 {v23.4s, v24.4s}, [x1], #32
-
-fmin v27.4s, v27.4s, v17.4s
-fmin v28.4s, v28.4s, v17.4s
-
-st1 {v27.4s, v28.4s}, [x1], #32
-
-L4:
-cmp x6, #0
-beq Loop16LineEnd
-Loop4:
-ld1 {v1.4s}, [x5], #16
-fadd v1.4s, v1.4s, v0.4s
-fmax v1.4s, v1.4s, v16.4s
-fmin v1.4s, v1.4s, v17.4s
-
-subs x6, x6, #1
-st1 {v1.4s}, [x1], #16
-bne Loop4
-
-Loop16LineEnd:
-subs x2, x2, #1
-bne LoopOc4
-
-BiasAddEnd:
-
-ret
-#endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/relu.S b/mindspore/lite/nnacl/assembly/arm64/relu.S
deleted file mode 100644
index 74c40a135b..0000000000
--- a/mindspore/lite/nnacl/assembly/arm64/relu.S
+++ /dev/null
@@ -1,73 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global Relu
-#ifndef __APPLE__
-    .type Relu, %function
-#endif
-
-
-//void Relu(float* data, size_t element4)
-
-//Auto: x0:data, x1: element4
-
-Relu:
-cmp x1, #0
-beq ReluEnd
-
-dup v16.4s, wzr
-
-mov x5, x0
-
-Loop16LineIn:
-cmp x1, #4
-blt L4
-sub x1, x1, #4
-
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fmax v5.4s, v16.4s, v1.4s
-fmax v6.4s, v16.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-cmp x1, #4
-blt Loop16LineOut
-
-Loop16:
-st1 {v5.4s, v6.4s}, [x0], #32
-fmax v7.4s, v16.4s, v3.4s
-fmax v8.4s, v16.4s, v4.4s
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-st1 {v7.4s, v8.4s}, [x0], #32
-fmax v5.4s, v16.4s, v1.4s
-fmax v6.4s, v16.4s, v2.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-sub x1, x1, #4
-cmp x1, #4
-bge Loop16
-
-Loop16LineOut:
-st1 {v5.4s, v6.4s}, [x0], #32
-fmax v7.4s, v16.4s, v3.4s
-fmax v8.4s, v16.4s, v4.4s
-
-st1 {v7.4s, v8.4s}, [x0], #32
-
-L4:
-cmp x1, #0
-beq ReluEnd
-Loop4:
-ld1 {v1.4s}, [x5], #16
-fmax v2.4s, v16.4s, v0.4s
-subs x1, x1, #1
-st1 {v2.4s}, [x0], #16
-bne Loop4
-
-ReluEnd:
-
-ret
-#endif
diff --git a/mindspore/lite/nnacl/assembly/arm64/relu6.S b/mindspore/lite/nnacl/assembly/arm64/relu6.S
deleted file mode 100644
index c1789845ee..0000000000
--- a/mindspore/lite/nnacl/assembly/arm64/relu6.S
+++ /dev/null
@@ -1,89 +0,0 @@
-
-#ifdef __aarch64__
-    .text
-    .align 5
-    //.p2align 5,,15
-    .global Relu6
-#ifndef __APPLE__
-    .type Relu6, %function
-#endif
-
-
-//void Relu6(float* data, size_t element4)
-
-//Auto: x0:data, x1: element4
-
-Relu6:
-cmp x1, #0
-beq Relu6End
-
-dup v16.4s, wzr
-movi v17.4s, #6
-scvtf v17.4s, v17.4s
-
-mov x5, x0
-
-Loop16LineIn:
-cmp x1, #4
-blt L4
-sub x1, x1, #4
-
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fmax v21.4s, v1.4s, v16.4s
-fmax v22.4s, v2.4s, v16.4s
-ld1 {v3.4s, v4.4s}, [x5], #32
-
-fmin v23.4s, v21.4s, v17.4s
-fmin v24.4s, v22.4s, v17.4s
-
-
-cmp x1, #4
-blt Loop16LineOut
-
-Loop16:
-st1 {v23.4s, v24.4s}, [x0], #32
-fmax v25.4s, v3.4s, v16.4s
-fmax v26.4s, v4.4s, v16.4s
-ld1 {v1.4s, v2.4s}, [x5], #32
-
-fmin v27.4s, v25.4s, v17.4s
-fmin v28.4s, v26.4s, v17.4s
-fmax v21.4s, v1.4s, v16.4s
-fmax v22.4s, v2.4s, v16.4s
-
-st1 {v27.4s, v28.4s}, [x0], #32
-ld1 {v3.4s, v4.4s}, [x5], #32
-fmin v23.4s, v21.4s, v17.4s
-fmin v24.4s, v22.4s, v17.4s
-
-sub x1, x1, #4
-cmp x1, #4
-bge Loop16
-
-Loop16LineOut:
-st1 {v23.4s, v24.4s}, [x0], #32
-fmax v25.4s, v3.4s, v16.4s
-fmax v26.4s, v4.4s, v16.4s
-
-fmin v27.4s, v25.4s, v17.4s
-fmin v28.4s, v26.4s, v17.4s
-st1 {v27.4s, v28.4s}, [x0], #32
-
-L4:
-cmp x1, #0
-beq Relu6End
-Loop4:
-ld1 {v1.4s}, [x5], #16
-fmax v1.4s, v1.4s, v16.4s
-
-fmin v1.4s, v1.4s, v17.4s
-
-subs x1, x1, #1
-st1 {v1.4s}, [x0], #16
-bne Loop4
-
-Relu6End:
-
-ret
-#endif
diff --git a/mindspore/lite/nnacl/fp32/conv.c b/mindspore/lite/nnacl/fp32/conv.c
index 92643dddb8..3c89acc73d 100644
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -81,11 +81,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
   int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
   int output_count = out_w_block * out_h_block;
-#ifdef ENABLE_ARM32
-  const int tile_num = 4;
-#else
-  const int tile_num = 12;
-#endif
+  const int tile_num = C12NUM;  // one tile count for every target; replaces the ENABLE_ARM32 (4) / other (12) split
   int output_tile_count = UP_DIV(output_count, tile_num);
   int out_channel = conv_param->output_channel_;
   int oc8 = UP_DIV(out_channel, C8NUM);
@@ -117,7 +113,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
       float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
       for (int i = 0; i < input_unit_square; ++i) {
 #ifdef ENABLE_ARM32
-        RowMajor2Col4Major(src_ptr + i * C4NUM * in_channel, tmp_col_ptr, C4NUM, in_channel);
+        RowMajor2Col4Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);  // ARM32 keeps the Col4 packer; source offset multiplier and third argument are now C12NUM (were C4NUM)
 #else
         RowMajor2Col12Major(src_ptr + i * C12NUM * in_channel, tmp_col_ptr, C12NUM, in_channel);
 #endif
diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c
index 42967768f9..ed1f8ebea4 100644
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -85,11 +85,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
         }    // interval y loop
       }
       // input transform
-#ifdef ENABLE_ARM32
-      const int tile_num = 4;
-#else
-      const int tile_num = 12;
-#endif
+      const int tile_num = C12NUM;  // same on every target now; feeds dst_step = tile_num * in_channel below
       int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
       size_t dst_step = tile_num * in_channel;
       float *trans_input_ptr = trans_input + dst_ic4_offset;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index 925577c64d..47107834e5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -184,11 +184,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
 int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   int channel_out = conv_param_->output_channel_;
   int oc8 = UP_DIV(channel_out, C8NUM);
-#ifdef ENABLE_ARM32
-  int tile_num = 4;
-#else
-  int tile_num = 12;
-#endif
+  int tile_num = C12NUM;  // no longer target-dependent; the ENABLE_ARM32 value of 4 was removed
   MS_ASSERT(ctx_->allocator != nullptr);
 
   size_t tile_buffer_size =