diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
index 4a81030f11..69c014c5f7 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
@@ -16,12 +16,16 @@
 
 // v0 ~ v15 value
 // v16  v17 bias data
-// x24  x25  weite loop tmp buf
-// x26  relu6  #6;    x27 relu #0
+// x14  x15  write loop tmp buf
+// v26  relu6  #6;    v27 relu #0
 // w10  oc8 loop control
 // w13  hw  loop control
 
 PostFuncBiasReluC8:
+  sub sp, sp, #128                             // save area for callee-saved v8-v15; sp stays lowered until End so the area is never below sp
+  add x15, sp, #64                             // x15 is only a scratch pointer here; the loops below rewrite it before reading it
+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x15]
   movi v26.4s, #6
   scvtf v26.4s, v26.4s
   dup v27.4s, wzr
@@ -30,9 +34,9 @@ PostFuncBiasReluC8:
 Loop_C8:
   cmp w10, w3
   beq Loop_C1
-  mov x25,  #4
-  mul x24, x10, x25
-  add x25, x0, x24
+  mov x15,  #4
+  mul x14, x10, x15
+  add x15, x0, x14                             // x15 = x0 + w10 * 4: store cursor for this group of 8 channels
   add w10, w10, #8
   mov w13, w5
   ld1 {v16.4s, v17.4s}, [x2], #32
@@ -103,14 +107,14 @@ Relu_8x8:
   fmax v14.4s, v14.4s, v27.4s
   fmax v15.4s, v15.4s, v27.4s
 Write_8x8:
-  st1 {v0.4s, v1.4s}, [x25], x6
-  st1 {v2.4s, v3.4s}, [x25], x6
-  st1 {v4.4s, v5.4s}, [x25], x6
-  st1 {v6.4s, v7.4s}, [x25], x6
-  st1 {v8.4s, v9.4s}, [x25], x6
-  st1 {v10.4s, v11.4s}, [x25], x6
-  st1 {v12.4s, v13.4s}, [x25], x6
-  st1 {v14.4s, v15.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
+  st1 {v2.4s, v3.4s}, [x15], x6
+  st1 {v4.4s, v5.4s}, [x15], x6
+  st1 {v6.4s, v7.4s}, [x15], x6
+  st1 {v8.4s, v9.4s}, [x15], x6
+  st1 {v10.4s, v11.4s}, [x15], x6
+  st1 {v12.4s, v13.4s}, [x15], x6
+  st1 {v14.4s, v15.4s}, [x15], x6
   b Loop8x8
 
 Loop_4x8:
@@ -153,10 +157,10 @@ Relu_4x8:
   fmax v6.4s, v6.4s, v27.4s
   fmax v7.4s, v7.4s, v27.4s
 Write_4x8:
-  st1 {v0.4s, v1.4s}, [x25], x6
-  st1 {v2.4s, v3.4s}, [x25], x6
-  st1 {v4.4s, v5.4s}, [x25], x6
-  st1 {v6.4s, v7.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
+  st1 {v2.4s, v3.4s}, [x15], x6
+  st1 {v4.4s, v5.4s}, [x15], x6
+  st1 {v6.4s, v7.4s}, [x15], x6
 
 Loop_1x8:
   cmp w7, #2
@@ -175,7 +179,7 @@ Relu6_1x8:
   fmin v1.4s, v1.4s, v26.4s
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Relu6_1x8
 Relu_1x8:
   cmp w13, #0
@@ -186,7 +190,7 @@ Relu_1x8:
   fadd v1.4s, v1.4s, v17.4s
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Relu_1x8
 Write_1x8:
   cmp w13, #0
@@ -195,7 +199,7 @@ Write_1x8:
   ld1 {v0.4s, v1.4s}, [x1], #32
   fadd v0.4s, v0.4s, v16.4s
   fadd v1.4s, v1.4s, v17.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Write_1x8
 
 
@@ -204,9 +208,9 @@ Loop_C1:
   beq End
   mov w13, w5
   ld1 {v16.4s, v17.4s}, [x2], #32
-  mov x25,  #4
-  mul x24, x10, x25
-  add x0, x0, x24
+  mov x15,  #4
+  mul x14, x10, x15
+  add x0, x0, x14
 
   cmp x4, #1
   beq Loop_C1_1
@@ -302,7 +306,7 @@ Loop_C1_2_Write:
 
 
 Loop_C1_3:
-  add x25, x0, #8
+  add x15, x0, #8
   cmp w7, #2
   beq Loop_C1_3_Relu6
   cmp w7, #1
@@ -319,7 +323,7 @@ Loop_C1_3_Relu6:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Relu6
 Loop_C1_3_Relu:
   cmp w13, #0
@@ -331,7 +335,7 @@ Loop_C1_3_Relu:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Relu
 Loop_C1_3_Write:
   cmp w13, #0
@@ -342,7 +346,7 @@ Loop_C1_3_Write:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Write
 
 Loop_C1_4:
@@ -380,7 +384,7 @@ Loop_C1_4_Write:
   b Loop_C1_4_Write
 
 Loop_C1_5:
-  add x25, x0, #16
+  add x15, x0, #16
   cmp w7, #2
   beq Loop_C1_5_Relu6
   cmp w7, #1
@@ -398,8 +402,8 @@ Loop_C1_5_Relu6:
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Relu6
 Loop_C1_5_Relu:
   cmp w13, #0
@@ -411,8 +415,8 @@ Loop_C1_5_Relu:
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Relu
 Loop_C1_5_Write:
   cmp w13, #0
@@ -422,12 +426,12 @@ Loop_C1_5_Write:
   fadd v0.4s, v0.4s, v16.4s
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Write
 
 Loop_C1_6:
-  add x25, x0, #16
+  add x15, x0, #16
   cmp w7, #2
   beq Loop_C1_6_Relu6
   cmp w7, #1
@@ -446,8 +450,8 @@ Loop_C1_6_Relu6:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Relu6
 Loop_C1_6_Relu:
   cmp w13, #0
@@ -460,8 +464,8 @@ Loop_C1_6_Relu:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Relu
 Loop_C1_6_Write:
   cmp w13, #0
@@ -472,13 +476,13 @@ Loop_C1_6_Write:
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Write
 
 Loop_C1_7:
-  add x25, x0, #16
-  add x24, x0, #24
+  add x15, x0, #16
+  add x14, x0, #24
   cmp w7, #2
   beq Loop_C1_7_Relu6
   cmp w7, #1
@@ -497,9 +501,9 @@ Loop_C1_7_Relu6:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x24], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Relu6
 Loop_C1_7_Relu:
   cmp w13, #0
@@ -512,9 +516,9 @@ Loop_C1_7_Relu:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x24], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Relu
 Loop_C1_7_Write:
   cmp w13, #0
@@ -525,11 +529,14 @@ Loop_C1_7_Write:
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x24], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Write
 
 End:
+  // sp still points at the v8-v15 save area; the two post-increments restore the registers and release it. NOTE(review): assumes every exit goes through End
+  ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+  ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   ret
 #endif