diff --git a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
index 4a81030f11..69c014c5f7 100644
--- a/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
+++ b/mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S
@@ -16,12 +16,16 @@
 
 // v0 ~ v15 value
 // v16 v17 bias data
-// x24 x25 weite loop tmp buf
-// x26 relu6 #6; x27 relu #0
+// x14 x15 write loop tmp buf
+// v26 relu6 #6; v27 relu #0
 // w10 oc8 loop control
 // w13 hw loop control
 
 PostFuncBiasReluC8:
+  sub sp, sp, #128                                   // reserve the v8-v15 save area first; sp stays below it for the whole routine
+  mov x15, sp                                        // x15 is only a scratch cursor here (assumed dead at entry; reloaded before each later use)
+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x15], #64     // save v8-v11 at [sp, sp+64)
+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x15]        // save v12-v15 at [sp+64, sp+128)
   movi v26.4s, #6
   scvtf v26.4s, v26.4s
   dup v27.4s, wzr
@@ -30,9 +34,9 @@ PostFuncBiasReluC8:
 Loop_C8:
   cmp w10, w3
   beq Loop_C1
-  mov x25, #4
-  mul x24, x10, x25
-  add x25, x0, x24
+  mov x15, #4
+  mul x14, x10, x15                                  // x14 = 4 * x10 (byte offset of this 8-wide block)
+  add x15, x0, x14                                   // x15 = x0 + x14: store cursor used by the Write_* paths
   add w10, w10, #8
   mov w13, w5
   ld1 {v16.4s, v17.4s}, [x2], #32
@@ -103,14 +107,14 @@ Relu_8x8:
   fmax v14.4s, v14.4s, v27.4s
   fmax v15.4s, v15.4s, v27.4s
 Write_8x8:
-  st1 {v0.4s, v1.4s}, [x25], x6
-  st1 {v2.4s, v3.4s}, [x25], x6
-  st1 {v4.4s, v5.4s}, [x25], x6
-  st1 {v6.4s, v7.4s}, [x25], x6
-  st1 {v8.4s, v9.4s}, [x25], x6
-  st1 {v10.4s, v11.4s}, [x25], x6
-  st1 {v12.4s, v13.4s}, [x25], x6
-  st1 {v14.4s, v15.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6                      // each store writes 8 values, then advances x15 by x6
+  st1 {v2.4s, v3.4s}, [x15], x6
+  st1 {v4.4s, v5.4s}, [x15], x6
+  st1 {v6.4s, v7.4s}, [x15], x6
+  st1 {v8.4s, v9.4s}, [x15], x6
+  st1 {v10.4s, v11.4s}, [x15], x6
+  st1 {v12.4s, v13.4s}, [x15], x6
+  st1 {v14.4s, v15.4s}, [x15], x6
   b Loop8x8
 
 Loop_4x8:
@@ -153,10 +157,10 @@ Relu_4x8:
   fmax v6.4s, v6.4s, v27.4s
   fmax v7.4s, v7.4s, v27.4s
 Write_4x8:
-  st1 {v0.4s, v1.4s}, [x25], x6
-  st1 {v2.4s, v3.4s}, [x25], x6
-  st1 {v4.4s, v5.4s}, [x25], x6
-  st1 {v6.4s, v7.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
+  st1 {v2.4s, v3.4s}, [x15], x6
+  st1 {v4.4s, v5.4s}, [x15], x6
+  st1 {v6.4s, v7.4s}, [x15], x6
 
 Loop_1x8:
   cmp w7, #2
@@ -175,7 +179,7 @@ Relu6_1x8:
   fmin v1.4s, v1.4s, v26.4s
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Relu6_1x8
 Relu_1x8:
   cmp w13, #0
@@ -186,7 +190,7 @@ Relu_1x8:
   fadd v1.4s, v1.4s, v17.4s
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Relu_1x8
 Write_1x8:
   cmp w13, #0
@@ -195,7 +199,7 @@ Write_1x8:
   ld1 {v0.4s, v1.4s}, [x1], #32
   fadd v0.4s, v0.4s, v16.4s
   fadd v1.4s, v1.4s, v17.4s
-  st1 {v0.4s, v1.4s}, [x25], x6
+  st1 {v0.4s, v1.4s}, [x15], x6
   b Write_1x8
 
 
@@ -204,9 +208,9 @@ Loop_C1:
   beq End
   mov w13, w5
   ld1 {v16.4s, v17.4s}, [x2], #32
-  mov x25, #4
-  mul x24, x10, x25
-  add x0, x0, x24
+  mov x15, #4
+  mul x14, x10, x15                                  // x14 = 4 * x10 (byte offset of the remainder block)
+  add x0, x0, x14                                    // the remainder paths store through x0 directly
 
   cmp x4, #1
   beq Loop_C1_1
@@ -302,7 +306,7 @@ Loop_C1_2_Write:
 
 
 Loop_C1_3:
-  add x25, x0, #8
+  add x15, x0, #8                                    // x15 = address of the 3rd value (byte offset 8)
   cmp w7, #2
   beq Loop_C1_3_Relu6
   cmp w7, #1
@@ -319,7 +323,7 @@ Loop_C1_3_Relu6:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Relu6
 Loop_C1_3_Relu:
   cmp w13, #0
@@ -331,7 +335,7 @@ Loop_C1_3_Relu:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Relu
 Loop_C1_3_Write:
   cmp w13, #0
@@ -342,7 +346,7 @@ Loop_C1_3_Write:
   dup s1, v0.s[1]
   stp s0, s1, [x0]
   add x0, x0, x6
-  st1 {v0.s}[2], [x25], x6
+  st1 {v0.s}[2], [x15], x6
   b Loop_C1_3_Write
 
 Loop_C1_4:
@@ -380,7 +384,7 @@ Loop_C1_4_Write:
   b Loop_C1_4_Write
 
 Loop_C1_5:
-  add x25, x0, #16
+  add x15, x0, #16                                   // x15 = address of the 5th value (byte offset 16)
   cmp w7, #2
   beq Loop_C1_5_Relu6
   cmp w7, #1
@@ -398,8 +402,8 @@ Loop_C1_5_Relu6:
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Relu6
 Loop_C1_5_Relu:
   cmp w13, #0
@@ -411,8 +415,8 @@ Loop_C1_5_Relu:
   fmax v0.4s, v0.4s, v27.4s
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Relu
 Loop_C1_5_Write:
   cmp w13, #0
@@ -422,12 +426,12 @@ Loop_C1_5_Write:
   fadd v0.4s, v0.4s, v16.4s
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
-  str s1, [x25]
-  add x25, x25, x6
+  str s1, [x15]
+  add x15, x15, x6
   b Loop_C1_5_Write
 
 Loop_C1_6:
-  add x25, x0, #16
+  add x15, x0, #16                                   // x15 = address of the 5th/6th value pair (byte offset 16)
   cmp w7, #2
   beq Loop_C1_6_Relu6
   cmp w7, #1
@@ -446,8 +450,8 @@ Loop_C1_6_Relu6:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Relu6
 Loop_C1_6_Relu:
   cmp w13, #0
@@ -460,8 +464,8 @@ Loop_C1_6_Relu:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Relu
 Loop_C1_6_Write:
   cmp w13, #0
@@ -472,13 +476,13 @@ Loop_C1_6_Write:
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
   b Loop_C1_6_Write
 
 Loop_C1_7:
-  add x25, x0, #16
-  add x24, x0, #24
+  add x15, x0, #16                                   // x15 = address of the 5th/6th value pair (byte offset 16)
+  add x14, x0, #24                                   // x14 = address of the 7th value (byte offset 24)
   cmp w7, #2
   beq Loop_C1_7_Relu6
   cmp w7, #1
@@ -497,9 +501,9 @@ Loop_C1_7_Relu6:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x24], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Relu6
 Loop_C1_7_Relu:
   cmp w13, #0
@@ -512,9 +516,9 @@ Loop_C1_7_Relu:
   fmax v1.4s, v1.4s, v27.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x14], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Relu
 Loop_C1_7_Write:
   cmp w13, #0
@@ -525,11 +529,14 @@ Loop_C1_7_Write:
   fadd v1.4s, v1.4s, v17.4s
   st1 {v0.4s}, [x0], x6
   dup s0, v1.s[1]
-  stp s1, s0, [x25]
-  add x25, x25, x6
-  st1 {v1.s}[2], [x24], x6
+  stp s1, s0, [x15]
+  add x15, x15, x6
+  st1 {v1.s}[2], [x14], x6
   b Loop_C1_7_Write
 
 End:
+  // Pop v8-v15 from the 128-byte frame reserved at entry; the two post-increments return sp to its entry value.
+  ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
+  ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
   ret
 #endif