| @@ -16,12 +16,16 @@ | |||
| // v0 ~ v15 value | |||
| // v16 v17 bias data | |||
| // x24 x25 weite loop tmp buf | |||
| // x26 relu6 #6; x27 relu #0 | |||
| // x14 x15 write loop tmp buf | |||
| // v26 relu6 #6; v27 relu #0 | |||
| // w10 oc8 loop control | |||
| // w13 hw loop control | |||
| PostFuncBiasReluC8: | |||
| sub sp, sp, #128 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| movi v26.4s, #6 | |||
| scvtf v26.4s, v26.4s | |||
| dup v27.4s, wzr | |||
| @@ -30,9 +34,9 @@ PostFuncBiasReluC8: | |||
| Loop_C8: | |||
| cmp w10, w3 | |||
| beq Loop_C1 | |||
| mov x25, #4 | |||
| mul x24, x10, x25 | |||
| add x25, x0, x24 | |||
| mov x15, #4 | |||
| mul x14, x10, x15 | |||
| add x15, x0, x14 | |||
| add w10, w10, #8 | |||
| mov w13, w5 | |||
| ld1 {v16.4s, v17.4s}, [x2], #32 | |||
| @@ -103,14 +107,14 @@ Relu_8x8: | |||
| fmax v14.4s, v14.4s, v27.4s | |||
| fmax v15.4s, v15.4s, v27.4s | |||
| Write_8x8: | |||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||
| st1 {v2.4s, v3.4s}, [x25], x6 | |||
| st1 {v4.4s, v5.4s}, [x25], x6 | |||
| st1 {v6.4s, v7.4s}, [x25], x6 | |||
| st1 {v8.4s, v9.4s}, [x25], x6 | |||
| st1 {v10.4s, v11.4s}, [x25], x6 | |||
| st1 {v12.4s, v13.4s}, [x25], x6 | |||
| st1 {v14.4s, v15.4s}, [x25], x6 | |||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||
| st1 {v2.4s, v3.4s}, [x15], x6 | |||
| st1 {v4.4s, v5.4s}, [x15], x6 | |||
| st1 {v6.4s, v7.4s}, [x15], x6 | |||
| st1 {v8.4s, v9.4s}, [x15], x6 | |||
| st1 {v10.4s, v11.4s}, [x15], x6 | |||
| st1 {v12.4s, v13.4s}, [x15], x6 | |||
| st1 {v14.4s, v15.4s}, [x15], x6 | |||
| b Loop8x8 | |||
| Loop_4x8: | |||
| @@ -153,10 +157,10 @@ Relu_4x8: | |||
| fmax v6.4s, v6.4s, v27.4s | |||
| fmax v7.4s, v7.4s, v27.4s | |||
| Write_4x8: | |||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||
| st1 {v2.4s, v3.4s}, [x25], x6 | |||
| st1 {v4.4s, v5.4s}, [x25], x6 | |||
| st1 {v6.4s, v7.4s}, [x25], x6 | |||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||
| st1 {v2.4s, v3.4s}, [x15], x6 | |||
| st1 {v4.4s, v5.4s}, [x15], x6 | |||
| st1 {v6.4s, v7.4s}, [x15], x6 | |||
| Loop_1x8: | |||
| cmp w7, #2 | |||
| @@ -175,7 +179,7 @@ Relu6_1x8: | |||
| fmin v1.4s, v1.4s, v26.4s | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||
| b Relu6_1x8 | |||
| Relu_1x8: | |||
| cmp w13, #0 | |||
| @@ -186,7 +190,7 @@ Relu_1x8: | |||
| fadd v1.4s, v1.4s, v17.4s | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||
| b Relu_1x8 | |||
| Write_1x8: | |||
| cmp w13, #0 | |||
| @@ -195,7 +199,7 @@ Write_1x8: | |||
| ld1 {v0.4s, v1.4s}, [x1], #32 | |||
| fadd v0.4s, v0.4s, v16.4s | |||
| fadd v1.4s, v1.4s, v17.4s | |||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||
| b Write_1x8 | |||
| @@ -204,9 +208,9 @@ Loop_C1: | |||
| beq End | |||
| mov w13, w5 | |||
| ld1 {v16.4s, v17.4s}, [x2], #32 | |||
| mov x25, #4 | |||
| mul x24, x10, x25 | |||
| add x0, x0, x24 | |||
| mov x15, #4 | |||
| mul x14, x10, x15 | |||
| add x0, x0, x14 | |||
| cmp x4, #1 | |||
| beq Loop_C1_1 | |||
| @@ -302,7 +306,7 @@ Loop_C1_2_Write: | |||
| Loop_C1_3: | |||
| add x25, x0, #8 | |||
| add x15, x0, #8 | |||
| cmp w7, #2 | |||
| beq Loop_C1_3_Relu6 | |||
| cmp w7, #1 | |||
| @@ -319,7 +323,7 @@ Loop_C1_3_Relu6: | |||
| dup s1, v0.s[1] | |||
| stp s0, s1, [x0] | |||
| add x0, x0, x6 | |||
| st1 {v0.s}[2], [x25], x6 | |||
| st1 {v0.s}[2], [x15], x6 | |||
| b Loop_C1_3_Relu6 | |||
| Loop_C1_3_Relu: | |||
| cmp w13, #0 | |||
| @@ -331,7 +335,7 @@ Loop_C1_3_Relu: | |||
| dup s1, v0.s[1] | |||
| stp s0, s1, [x0] | |||
| add x0, x0, x6 | |||
| st1 {v0.s}[2], [x25], x6 | |||
| st1 {v0.s}[2], [x15], x6 | |||
| b Loop_C1_3_Relu | |||
| Loop_C1_3_Write: | |||
| cmp w13, #0 | |||
| @@ -342,7 +346,7 @@ Loop_C1_3_Write: | |||
| dup s1, v0.s[1] | |||
| stp s0, s1, [x0] | |||
| add x0, x0, x6 | |||
| st1 {v0.s}[2], [x25], x6 | |||
| st1 {v0.s}[2], [x15], x6 | |||
| b Loop_C1_3_Write | |||
| Loop_C1_4: | |||
| @@ -380,7 +384,7 @@ Loop_C1_4_Write: | |||
| b Loop_C1_4_Write | |||
| Loop_C1_5: | |||
| add x25, x0, #16 | |||
| add x15, x0, #16 | |||
| cmp w7, #2 | |||
| beq Loop_C1_5_Relu6 | |||
| cmp w7, #1 | |||
| @@ -398,8 +402,8 @@ Loop_C1_5_Relu6: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| str s1, [x25] | |||
| add x25, x25, x6 | |||
| str s1, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_5_Relu6 | |||
| Loop_C1_5_Relu: | |||
| cmp w13, #0 | |||
| @@ -411,8 +415,8 @@ Loop_C1_5_Relu: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| str s1, [x25] | |||
| add x25, x25, x6 | |||
| str s1, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_5_Relu | |||
| Loop_C1_5_Write: | |||
| cmp w13, #0 | |||
| @@ -422,12 +426,12 @@ Loop_C1_5_Write: | |||
| fadd v0.4s, v0.4s, v16.4s | |||
| fadd v1.4s, v1.4s, v17.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| str s1, [x25] | |||
| add x25, x25, x6 | |||
| str s1, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_5_Write | |||
| Loop_C1_6: | |||
| add x25, x0, #16 | |||
| add x15, x0, #16 | |||
| cmp w7, #2 | |||
| beq Loop_C1_6_Relu6 | |||
| cmp w7, #1 | |||
| @@ -446,8 +450,8 @@ Loop_C1_6_Relu6: | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_6_Relu6 | |||
| Loop_C1_6_Relu: | |||
| cmp w13, #0 | |||
| @@ -460,8 +464,8 @@ Loop_C1_6_Relu: | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_6_Relu | |||
| Loop_C1_6_Write: | |||
| cmp w13, #0 | |||
| @@ -472,13 +476,13 @@ Loop_C1_6_Write: | |||
| fadd v1.4s, v1.4s, v17.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| b Loop_C1_6_Write | |||
| Loop_C1_7: | |||
| add x25, x0, #16 | |||
| add x24, x0, #24 | |||
| add x15, x0, #16 | |||
| add x14, x0, #24 | |||
| cmp w7, #2 | |||
| beq Loop_C1_7_Relu6 | |||
| cmp w7, #1 | |||
| @@ -497,9 +501,9 @@ Loop_C1_7_Relu6: | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| st1 {v1.s}[2], [x24], x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| st1 {v1.s}[2], [x14], x6 | |||
| b Loop_C1_7_Relu6 | |||
| Loop_C1_7_Relu: | |||
| cmp w13, #0 | |||
| @@ -512,9 +516,9 @@ Loop_C1_7_Relu: | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| st1 {v1.s}[2], [x24], x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| st1 {v1.s}[2], [x14], x6 | |||
| b Loop_C1_7_Relu | |||
| Loop_C1_7_Write: | |||
| cmp w13, #0 | |||
| @@ -525,11 +529,14 @@ Loop_C1_7_Write: | |||
| fadd v1.4s, v1.4s, v17.4s | |||
| st1 {v0.4s}, [x0], x6 | |||
| dup s0, v1.s[1] | |||
| stp s1, s0, [x25] | |||
| add x25, x25, x6 | |||
| st1 {v1.s}[2], [x24], x6 | |||
| stp s1, s0, [x15] | |||
| add x15, x15, x6 | |||
| st1 {v1.s}[2], [x14], x6 | |||
| b Loop_C1_7_Write | |||
| End: | |||
| sub sp, sp, #128 | |||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||
| ret | |||
| #endif | |||