Browse Source

[MSLITE][Develop] deconv post c8 neon v8~v15 bug

tags/v1.0.0
ling 5 years ago
parent
commit
eb0be0d29f
1 changed files with 59 additions and 52 deletions
  1. +59
    -52
      mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S

+ 59
- 52
mindspore/lite/nnacl/assembly/arm64/PostFuncBiasReluC8.S View File

@@ -16,12 +16,16 @@


// v0 ~ v15 value // v0 ~ v15 value
// v16 v17 bias data // v16 v17 bias data
// x24 x25 write loop tmp buf
// x26 relu6 #6; x27 relu #0
// x14 x15 write loop tmp buf
// v26 relu6 #6; v27 relu #0
// w10 oc8 loop control // w10 oc8 loop control
// w13 hw loop control // w13 hw loop control


PostFuncBiasReluC8: PostFuncBiasReluC8:
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64

movi v26.4s, #6 movi v26.4s, #6
scvtf v26.4s, v26.4s scvtf v26.4s, v26.4s
dup v27.4s, wzr dup v27.4s, wzr
@@ -30,9 +34,9 @@ PostFuncBiasReluC8:
Loop_C8: Loop_C8:
cmp w10, w3 cmp w10, w3
beq Loop_C1 beq Loop_C1
mov x25, #4
mul x24, x10, x25
add x25, x0, x24
mov x15, #4
mul x14, x10, x15
add x15, x0, x14
add w10, w10, #8 add w10, w10, #8
mov w13, w5 mov w13, w5
ld1 {v16.4s, v17.4s}, [x2], #32 ld1 {v16.4s, v17.4s}, [x2], #32
@@ -103,14 +107,14 @@ Relu_8x8:
fmax v14.4s, v14.4s, v27.4s fmax v14.4s, v14.4s, v27.4s
fmax v15.4s, v15.4s, v27.4s fmax v15.4s, v15.4s, v27.4s
Write_8x8: Write_8x8:
st1 {v0.4s, v1.4s}, [x25], x6
st1 {v2.4s, v3.4s}, [x25], x6
st1 {v4.4s, v5.4s}, [x25], x6
st1 {v6.4s, v7.4s}, [x25], x6
st1 {v8.4s, v9.4s}, [x25], x6
st1 {v10.4s, v11.4s}, [x25], x6
st1 {v12.4s, v13.4s}, [x25], x6
st1 {v14.4s, v15.4s}, [x25], x6
st1 {v0.4s, v1.4s}, [x15], x6
st1 {v2.4s, v3.4s}, [x15], x6
st1 {v4.4s, v5.4s}, [x15], x6
st1 {v6.4s, v7.4s}, [x15], x6
st1 {v8.4s, v9.4s}, [x15], x6
st1 {v10.4s, v11.4s}, [x15], x6
st1 {v12.4s, v13.4s}, [x15], x6
st1 {v14.4s, v15.4s}, [x15], x6
b Loop8x8 b Loop8x8


Loop_4x8: Loop_4x8:
@@ -153,10 +157,10 @@ Relu_4x8:
fmax v6.4s, v6.4s, v27.4s fmax v6.4s, v6.4s, v27.4s
fmax v7.4s, v7.4s, v27.4s fmax v7.4s, v7.4s, v27.4s
Write_4x8: Write_4x8:
st1 {v0.4s, v1.4s}, [x25], x6
st1 {v2.4s, v3.4s}, [x25], x6
st1 {v4.4s, v5.4s}, [x25], x6
st1 {v6.4s, v7.4s}, [x25], x6
st1 {v0.4s, v1.4s}, [x15], x6
st1 {v2.4s, v3.4s}, [x15], x6
st1 {v4.4s, v5.4s}, [x15], x6
st1 {v6.4s, v7.4s}, [x15], x6


Loop_1x8: Loop_1x8:
cmp w7, #2 cmp w7, #2
@@ -175,7 +179,7 @@ Relu6_1x8:
fmin v1.4s, v1.4s, v26.4s fmin v1.4s, v1.4s, v26.4s
fmax v0.4s, v0.4s, v27.4s fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s, v1.4s}, [x25], x6
st1 {v0.4s, v1.4s}, [x15], x6
b Relu6_1x8 b Relu6_1x8
Relu_1x8: Relu_1x8:
cmp w13, #0 cmp w13, #0
@@ -186,7 +190,7 @@ Relu_1x8:
fadd v1.4s, v1.4s, v17.4s fadd v1.4s, v1.4s, v17.4s
fmax v0.4s, v0.4s, v27.4s fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s, v1.4s}, [x25], x6
st1 {v0.4s, v1.4s}, [x15], x6
b Relu_1x8 b Relu_1x8
Write_1x8: Write_1x8:
cmp w13, #0 cmp w13, #0
@@ -195,7 +199,7 @@ Write_1x8:
ld1 {v0.4s, v1.4s}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
fadd v0.4s, v0.4s, v16.4s fadd v0.4s, v0.4s, v16.4s
fadd v1.4s, v1.4s, v17.4s fadd v1.4s, v1.4s, v17.4s
st1 {v0.4s, v1.4s}, [x25], x6
st1 {v0.4s, v1.4s}, [x15], x6
b Write_1x8 b Write_1x8




@@ -204,9 +208,9 @@ Loop_C1:
beq End beq End
mov w13, w5 mov w13, w5
ld1 {v16.4s, v17.4s}, [x2], #32 ld1 {v16.4s, v17.4s}, [x2], #32
mov x25, #4
mul x24, x10, x25
add x0, x0, x24
mov x15, #4
mul x14, x10, x15
add x0, x0, x14


cmp x4, #1 cmp x4, #1
beq Loop_C1_1 beq Loop_C1_1
@@ -302,7 +306,7 @@ Loop_C1_2_Write:




Loop_C1_3: Loop_C1_3:
add x25, x0, #8
add x15, x0, #8
cmp w7, #2 cmp w7, #2
beq Loop_C1_3_Relu6 beq Loop_C1_3_Relu6
cmp w7, #1 cmp w7, #1
@@ -319,7 +323,7 @@ Loop_C1_3_Relu6:
dup s1, v0.s[1] dup s1, v0.s[1]
stp s0, s1, [x0] stp s0, s1, [x0]
add x0, x0, x6 add x0, x0, x6
st1 {v0.s}[2], [x25], x6
st1 {v0.s}[2], [x15], x6
b Loop_C1_3_Relu6 b Loop_C1_3_Relu6
Loop_C1_3_Relu: Loop_C1_3_Relu:
cmp w13, #0 cmp w13, #0
@@ -331,7 +335,7 @@ Loop_C1_3_Relu:
dup s1, v0.s[1] dup s1, v0.s[1]
stp s0, s1, [x0] stp s0, s1, [x0]
add x0, x0, x6 add x0, x0, x6
st1 {v0.s}[2], [x25], x6
st1 {v0.s}[2], [x15], x6
b Loop_C1_3_Relu b Loop_C1_3_Relu
Loop_C1_3_Write: Loop_C1_3_Write:
cmp w13, #0 cmp w13, #0
@@ -342,7 +346,7 @@ Loop_C1_3_Write:
dup s1, v0.s[1] dup s1, v0.s[1]
stp s0, s1, [x0] stp s0, s1, [x0]
add x0, x0, x6 add x0, x0, x6
st1 {v0.s}[2], [x25], x6
st1 {v0.s}[2], [x15], x6
b Loop_C1_3_Write b Loop_C1_3_Write


Loop_C1_4: Loop_C1_4:
@@ -380,7 +384,7 @@ Loop_C1_4_Write:
b Loop_C1_4_Write b Loop_C1_4_Write


Loop_C1_5: Loop_C1_5:
add x25, x0, #16
add x15, x0, #16
cmp w7, #2 cmp w7, #2
beq Loop_C1_5_Relu6 beq Loop_C1_5_Relu6
cmp w7, #1 cmp w7, #1
@@ -398,8 +402,8 @@ Loop_C1_5_Relu6:
fmax v0.4s, v0.4s, v27.4s fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
str s1, [x25]
add x25, x25, x6
str s1, [x15]
add x15, x15, x6
b Loop_C1_5_Relu6 b Loop_C1_5_Relu6
Loop_C1_5_Relu: Loop_C1_5_Relu:
cmp w13, #0 cmp w13, #0
@@ -411,8 +415,8 @@ Loop_C1_5_Relu:
fmax v0.4s, v0.4s, v27.4s fmax v0.4s, v0.4s, v27.4s
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
str s1, [x25]
add x25, x25, x6
str s1, [x15]
add x15, x15, x6
b Loop_C1_5_Relu b Loop_C1_5_Relu
Loop_C1_5_Write: Loop_C1_5_Write:
cmp w13, #0 cmp w13, #0
@@ -422,12 +426,12 @@ Loop_C1_5_Write:
fadd v0.4s, v0.4s, v16.4s fadd v0.4s, v0.4s, v16.4s
fadd v1.4s, v1.4s, v17.4s fadd v1.4s, v1.4s, v17.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
str s1, [x25]
add x25, x25, x6
str s1, [x15]
add x15, x15, x6
b Loop_C1_5_Write b Loop_C1_5_Write


Loop_C1_6: Loop_C1_6:
add x25, x0, #16
add x15, x0, #16
cmp w7, #2 cmp w7, #2
beq Loop_C1_6_Relu6 beq Loop_C1_6_Relu6
cmp w7, #1 cmp w7, #1
@@ -446,8 +450,8 @@ Loop_C1_6_Relu6:
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
stp s1, s0, [x15]
add x15, x15, x6
b Loop_C1_6_Relu6 b Loop_C1_6_Relu6
Loop_C1_6_Relu: Loop_C1_6_Relu:
cmp w13, #0 cmp w13, #0
@@ -460,8 +464,8 @@ Loop_C1_6_Relu:
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
stp s1, s0, [x15]
add x15, x15, x6
b Loop_C1_6_Relu b Loop_C1_6_Relu
Loop_C1_6_Write: Loop_C1_6_Write:
cmp w13, #0 cmp w13, #0
@@ -472,13 +476,13 @@ Loop_C1_6_Write:
fadd v1.4s, v1.4s, v17.4s fadd v1.4s, v1.4s, v17.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
stp s1, s0, [x15]
add x15, x15, x6
b Loop_C1_6_Write b Loop_C1_6_Write


Loop_C1_7: Loop_C1_7:
add x25, x0, #16
add x24, x0, #24
add x15, x0, #16
add x14, x0, #24
cmp w7, #2 cmp w7, #2
beq Loop_C1_7_Relu6 beq Loop_C1_7_Relu6
cmp w7, #1 cmp w7, #1
@@ -497,9 +501,9 @@ Loop_C1_7_Relu6:
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
st1 {v1.s}[2], [x24], x6
stp s1, s0, [x15]
add x15, x15, x6
st1 {v1.s}[2], [x14], x6
b Loop_C1_7_Relu6 b Loop_C1_7_Relu6
Loop_C1_7_Relu: Loop_C1_7_Relu:
cmp w13, #0 cmp w13, #0
@@ -512,9 +516,9 @@ Loop_C1_7_Relu:
fmax v1.4s, v1.4s, v27.4s fmax v1.4s, v1.4s, v27.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
st1 {v1.s}[2], [x24], x6
stp s1, s0, [x15]
add x15, x15, x6
st1 {v1.s}[2], [x14], x6
b Loop_C1_7_Relu b Loop_C1_7_Relu
Loop_C1_7_Write: Loop_C1_7_Write:
cmp w13, #0 cmp w13, #0
@@ -525,11 +529,14 @@ Loop_C1_7_Write:
fadd v1.4s, v1.4s, v17.4s fadd v1.4s, v1.4s, v17.4s
st1 {v0.4s}, [x0], x6 st1 {v0.4s}, [x0], x6
dup s0, v1.s[1] dup s0, v1.s[1]
stp s1, s0, [x25]
add x25, x25, x6
st1 {v1.s}[2], [x24], x6
stp s1, s0, [x15]
add x15, x15, x6
st1 {v1.s}[2], [x14], x6
b Loop_C1_7_Write b Loop_C1_7_Write


End: End:
sub sp, sp, #128
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret ret
#endif #endif

Loading…
Cancel
Save