|
|
|
@@ -16,12 +16,16 @@ |
|
|
|
|
|
|
|
// v0 ~ v15 value |
|
|
|
// v16 v17 bias data |
|
|
|
// x24 x25 write loop tmp buf |
|
|
|
// v26 relu6 #6; v27 relu #0 |
|
|
|
// x14 x15 write loop tmp buf |
|
|
|
// v26 relu6 #6; v27 relu #0 |
|
|
|
// w10 oc8 loop control |
|
|
|
// w13 hw loop control |
|
|
|
|
|
|
|
PostFuncBiasReluC8: |
|
|
|
sub sp, sp, #128 |
|
|
|
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 |
|
|
|
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 |
|
|
|
|
|
|
|
movi v26.4s, #6 |
|
|
|
scvtf v26.4s, v26.4s |
|
|
|
dup v27.4s, wzr |
|
|
|
@@ -30,9 +34,9 @@ PostFuncBiasReluC8: |
|
|
|
Loop_C8: |
|
|
|
cmp w10, w3 |
|
|
|
beq Loop_C1 |
|
|
|
mov x25, #4 |
|
|
|
mul x24, x10, x25 |
|
|
|
add x25, x0, x24 |
|
|
|
mov x15, #4 |
|
|
|
mul x14, x10, x15 |
|
|
|
add x15, x0, x14 |
|
|
|
add w10, w10, #8 |
|
|
|
mov w13, w5 |
|
|
|
ld1 {v16.4s, v17.4s}, [x2], #32 |
|
|
|
@@ -103,14 +107,14 @@ Relu_8x8: |
|
|
|
fmax v14.4s, v14.4s, v27.4s |
|
|
|
fmax v15.4s, v15.4s, v27.4s |
|
|
|
Write_8x8: |
|
|
|
st1 {v0.4s, v1.4s}, [x25], x6 |
|
|
|
st1 {v2.4s, v3.4s}, [x25], x6 |
|
|
|
st1 {v4.4s, v5.4s}, [x25], x6 |
|
|
|
st1 {v6.4s, v7.4s}, [x25], x6 |
|
|
|
st1 {v8.4s, v9.4s}, [x25], x6 |
|
|
|
st1 {v10.4s, v11.4s}, [x25], x6 |
|
|
|
st1 {v12.4s, v13.4s}, [x25], x6 |
|
|
|
st1 {v14.4s, v15.4s}, [x25], x6 |
|
|
|
st1 {v0.4s, v1.4s}, [x15], x6 |
|
|
|
st1 {v2.4s, v3.4s}, [x15], x6 |
|
|
|
st1 {v4.4s, v5.4s}, [x15], x6 |
|
|
|
st1 {v6.4s, v7.4s}, [x15], x6 |
|
|
|
st1 {v8.4s, v9.4s}, [x15], x6 |
|
|
|
st1 {v10.4s, v11.4s}, [x15], x6 |
|
|
|
st1 {v12.4s, v13.4s}, [x15], x6 |
|
|
|
st1 {v14.4s, v15.4s}, [x15], x6 |
|
|
|
b Loop8x8 |
|
|
|
|
|
|
|
Loop_4x8: |
|
|
|
@@ -153,10 +157,10 @@ Relu_4x8: |
|
|
|
fmax v6.4s, v6.4s, v27.4s |
|
|
|
fmax v7.4s, v7.4s, v27.4s |
|
|
|
Write_4x8: |
|
|
|
st1 {v0.4s, v1.4s}, [x25], x6 |
|
|
|
st1 {v2.4s, v3.4s}, [x25], x6 |
|
|
|
st1 {v4.4s, v5.4s}, [x25], x6 |
|
|
|
st1 {v6.4s, v7.4s}, [x25], x6 |
|
|
|
st1 {v0.4s, v1.4s}, [x15], x6 |
|
|
|
st1 {v2.4s, v3.4s}, [x15], x6 |
|
|
|
st1 {v4.4s, v5.4s}, [x15], x6 |
|
|
|
st1 {v6.4s, v7.4s}, [x15], x6 |
|
|
|
|
|
|
|
Loop_1x8: |
|
|
|
cmp w7, #2 |
|
|
|
@@ -175,7 +179,7 @@ Relu6_1x8: |
|
|
|
fmin v1.4s, v1.4s, v26.4s |
|
|
|
fmax v0.4s, v0.4s, v27.4s |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s, v1.4s}, [x25], x6 |
|
|
|
st1 {v0.4s, v1.4s}, [x15], x6 |
|
|
|
b Relu6_1x8 |
|
|
|
Relu_1x8: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -186,7 +190,7 @@ Relu_1x8: |
|
|
|
fadd v1.4s, v1.4s, v17.4s |
|
|
|
fmax v0.4s, v0.4s, v27.4s |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s, v1.4s}, [x25], x6 |
|
|
|
st1 {v0.4s, v1.4s}, [x15], x6 |
|
|
|
b Relu_1x8 |
|
|
|
Write_1x8: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -195,7 +199,7 @@ Write_1x8: |
|
|
|
ld1 {v0.4s, v1.4s}, [x1], #32 |
|
|
|
fadd v0.4s, v0.4s, v16.4s |
|
|
|
fadd v1.4s, v1.4s, v17.4s |
|
|
|
st1 {v0.4s, v1.4s}, [x25], x6 |
|
|
|
st1 {v0.4s, v1.4s}, [x15], x6 |
|
|
|
b Write_1x8 |
|
|
|
|
|
|
|
|
|
|
|
@@ -204,9 +208,9 @@ Loop_C1: |
|
|
|
beq End |
|
|
|
mov w13, w5 |
|
|
|
ld1 {v16.4s, v17.4s}, [x2], #32 |
|
|
|
mov x25, #4 |
|
|
|
mul x24, x10, x25 |
|
|
|
add x0, x0, x24 |
|
|
|
mov x15, #4 |
|
|
|
mul x14, x10, x15 |
|
|
|
add x0, x0, x14 |
|
|
|
|
|
|
|
cmp x4, #1 |
|
|
|
beq Loop_C1_1 |
|
|
|
@@ -302,7 +306,7 @@ Loop_C1_2_Write: |
|
|
|
|
|
|
|
|
|
|
|
Loop_C1_3: |
|
|
|
add x25, x0, #8 |
|
|
|
add x15, x0, #8 |
|
|
|
cmp w7, #2 |
|
|
|
beq Loop_C1_3_Relu6 |
|
|
|
cmp w7, #1 |
|
|
|
@@ -319,7 +323,7 @@ Loop_C1_3_Relu6: |
|
|
|
dup s1, v0.s[1] |
|
|
|
stp s0, s1, [x0] |
|
|
|
add x0, x0, x6 |
|
|
|
st1 {v0.s}[2], [x25], x6 |
|
|
|
st1 {v0.s}[2], [x15], x6 |
|
|
|
b Loop_C1_3_Relu6 |
|
|
|
Loop_C1_3_Relu: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -331,7 +335,7 @@ Loop_C1_3_Relu: |
|
|
|
dup s1, v0.s[1] |
|
|
|
stp s0, s1, [x0] |
|
|
|
add x0, x0, x6 |
|
|
|
st1 {v0.s}[2], [x25], x6 |
|
|
|
st1 {v0.s}[2], [x15], x6 |
|
|
|
b Loop_C1_3_Relu |
|
|
|
Loop_C1_3_Write: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -342,7 +346,7 @@ Loop_C1_3_Write: |
|
|
|
dup s1, v0.s[1] |
|
|
|
stp s0, s1, [x0] |
|
|
|
add x0, x0, x6 |
|
|
|
st1 {v0.s}[2], [x25], x6 |
|
|
|
st1 {v0.s}[2], [x15], x6 |
|
|
|
b Loop_C1_3_Write |
|
|
|
|
|
|
|
Loop_C1_4: |
|
|
|
@@ -380,7 +384,7 @@ Loop_C1_4_Write: |
|
|
|
b Loop_C1_4_Write |
|
|
|
|
|
|
|
Loop_C1_5: |
|
|
|
add x25, x0, #16 |
|
|
|
add x15, x0, #16 |
|
|
|
cmp w7, #2 |
|
|
|
beq Loop_C1_5_Relu6 |
|
|
|
cmp w7, #1 |
|
|
|
@@ -398,8 +402,8 @@ Loop_C1_5_Relu6: |
|
|
|
fmax v0.4s, v0.4s, v27.4s |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
str s1, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
str s1, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_5_Relu6 |
|
|
|
Loop_C1_5_Relu: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -411,8 +415,8 @@ Loop_C1_5_Relu: |
|
|
|
fmax v0.4s, v0.4s, v27.4s |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
str s1, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
str s1, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_5_Relu |
|
|
|
Loop_C1_5_Write: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -422,12 +426,12 @@ Loop_C1_5_Write: |
|
|
|
fadd v0.4s, v0.4s, v16.4s |
|
|
|
fadd v1.4s, v1.4s, v17.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
str s1, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
str s1, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_5_Write |
|
|
|
|
|
|
|
Loop_C1_6: |
|
|
|
add x25, x0, #16 |
|
|
|
add x15, x0, #16 |
|
|
|
cmp w7, #2 |
|
|
|
beq Loop_C1_6_Relu6 |
|
|
|
cmp w7, #1 |
|
|
|
@@ -446,8 +450,8 @@ Loop_C1_6_Relu6: |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_6_Relu6 |
|
|
|
Loop_C1_6_Relu: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -460,8 +464,8 @@ Loop_C1_6_Relu: |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_6_Relu |
|
|
|
Loop_C1_6_Write: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -472,13 +476,13 @@ Loop_C1_6_Write: |
|
|
|
fadd v1.4s, v1.4s, v17.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
b Loop_C1_6_Write |
|
|
|
|
|
|
|
Loop_C1_7: |
|
|
|
add x25, x0, #16 |
|
|
|
add x24, x0, #24 |
|
|
|
add x15, x0, #16 |
|
|
|
add x14, x0, #24 |
|
|
|
cmp w7, #2 |
|
|
|
beq Loop_C1_7_Relu6 |
|
|
|
cmp w7, #1 |
|
|
|
@@ -497,9 +501,9 @@ Loop_C1_7_Relu6: |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
st1 {v1.s}[2], [x24], x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
st1 {v1.s}[2], [x14], x6 |
|
|
|
b Loop_C1_7_Relu6 |
|
|
|
Loop_C1_7_Relu: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -512,9 +516,9 @@ Loop_C1_7_Relu: |
|
|
|
fmax v1.4s, v1.4s, v27.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
st1 {v1.s}[2], [x24], x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
st1 {v1.s}[2], [x14], x6 |
|
|
|
b Loop_C1_7_Relu |
|
|
|
Loop_C1_7_Write: |
|
|
|
cmp w13, #0 |
|
|
|
@@ -525,11 +529,14 @@ Loop_C1_7_Write: |
|
|
|
fadd v1.4s, v1.4s, v17.4s |
|
|
|
st1 {v0.4s}, [x0], x6 |
|
|
|
dup s0, v1.s[1] |
|
|
|
stp s1, s0, [x25] |
|
|
|
add x25, x25, x6 |
|
|
|
st1 {v1.s}[2], [x24], x6 |
|
|
|
stp s1, s0, [x15] |
|
|
|
add x15, x15, x6 |
|
|
|
st1 {v1.s}[2], [x14], x6 |
|
|
|
b Loop_C1_7_Write |
|
|
|
|
|
|
|
// End: common exit. Reloads v8-v15 (callee-saved under AAPCS64) from the |
// 128 bytes directly below sp, where the st1 pair after PostFuncBiasReluC8 |
// stored them, then returns. |
// NOTE(review): that save area sits below sp while the body runs; confirm |
// the target platform preserves memory below sp (or that nothing can |
// interrupt onto this stack) - TODO verify. |
End: |
|
|
|
// Step sp down to the start of the 128-byte save area. |
sub sp, sp, #128 |
|
|
|
// Restore v8-v11; post-index advances sp by 64 bytes. |
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 |
|
|
|
// Restore v12-v15; after this post-index sp is back at its value on entry to End. |
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 |
|
|
|
ret |
|
|
|
#endif |