| @@ -16,12 +16,16 @@ | |||||
| // v0 ~ v15 value | // v0 ~ v15 value | ||||
| // v16 v17 bias data | // v16 v17 bias data | ||||
| // x24 x25 write loop tmp buf | |||||
| // v26 relu6 #6; v27 relu #0 | |||||
| // x14 x15 write loop tmp buf | |||||
| // v26 relu6 #6; v27 relu #0 | |||||
| // w10 oc8 loop control | // w10 oc8 loop control | ||||
| // w13 hw loop control | // w13 hw loop control | ||||
| PostFuncBiasReluC8: | PostFuncBiasReluC8: | ||||
| sub sp, sp, #128 | |||||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||||
| movi v26.4s, #6 | movi v26.4s, #6 | ||||
| scvtf v26.4s, v26.4s | scvtf v26.4s, v26.4s | ||||
| dup v27.4s, wzr | dup v27.4s, wzr | ||||
| @@ -30,9 +34,9 @@ PostFuncBiasReluC8: | |||||
| Loop_C8: | Loop_C8: | ||||
| cmp w10, w3 | cmp w10, w3 | ||||
| beq Loop_C1 | beq Loop_C1 | ||||
| mov x25, #4 | |||||
| mul x24, x10, x25 | |||||
| add x25, x0, x24 | |||||
| mov x15, #4 | |||||
| mul x14, x10, x15 | |||||
| add x15, x0, x14 | |||||
| add w10, w10, #8 | add w10, w10, #8 | ||||
| mov w13, w5 | mov w13, w5 | ||||
| ld1 {v16.4s, v17.4s}, [x2], #32 | ld1 {v16.4s, v17.4s}, [x2], #32 | ||||
| @@ -103,14 +107,14 @@ Relu_8x8: | |||||
| fmax v14.4s, v14.4s, v27.4s | fmax v14.4s, v14.4s, v27.4s | ||||
| fmax v15.4s, v15.4s, v27.4s | fmax v15.4s, v15.4s, v27.4s | ||||
| Write_8x8: | Write_8x8: | ||||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||||
| st1 {v2.4s, v3.4s}, [x25], x6 | |||||
| st1 {v4.4s, v5.4s}, [x25], x6 | |||||
| st1 {v6.4s, v7.4s}, [x25], x6 | |||||
| st1 {v8.4s, v9.4s}, [x25], x6 | |||||
| st1 {v10.4s, v11.4s}, [x25], x6 | |||||
| st1 {v12.4s, v13.4s}, [x25], x6 | |||||
| st1 {v14.4s, v15.4s}, [x25], x6 | |||||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||||
| st1 {v2.4s, v3.4s}, [x15], x6 | |||||
| st1 {v4.4s, v5.4s}, [x15], x6 | |||||
| st1 {v6.4s, v7.4s}, [x15], x6 | |||||
| st1 {v8.4s, v9.4s}, [x15], x6 | |||||
| st1 {v10.4s, v11.4s}, [x15], x6 | |||||
| st1 {v12.4s, v13.4s}, [x15], x6 | |||||
| st1 {v14.4s, v15.4s}, [x15], x6 | |||||
| b Loop8x8 | b Loop8x8 | ||||
| Loop_4x8: | Loop_4x8: | ||||
| @@ -153,10 +157,10 @@ Relu_4x8: | |||||
| fmax v6.4s, v6.4s, v27.4s | fmax v6.4s, v6.4s, v27.4s | ||||
| fmax v7.4s, v7.4s, v27.4s | fmax v7.4s, v7.4s, v27.4s | ||||
| Write_4x8: | Write_4x8: | ||||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||||
| st1 {v2.4s, v3.4s}, [x25], x6 | |||||
| st1 {v4.4s, v5.4s}, [x25], x6 | |||||
| st1 {v6.4s, v7.4s}, [x25], x6 | |||||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||||
| st1 {v2.4s, v3.4s}, [x15], x6 | |||||
| st1 {v4.4s, v5.4s}, [x15], x6 | |||||
| st1 {v6.4s, v7.4s}, [x15], x6 | |||||
| Loop_1x8: | Loop_1x8: | ||||
| cmp w7, #2 | cmp w7, #2 | ||||
| @@ -175,7 +179,7 @@ Relu6_1x8: | |||||
| fmin v1.4s, v1.4s, v26.4s | fmin v1.4s, v1.4s, v26.4s | ||||
| fmax v0.4s, v0.4s, v27.4s | fmax v0.4s, v0.4s, v27.4s | ||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||||
| b Relu6_1x8 | b Relu6_1x8 | ||||
| Relu_1x8: | Relu_1x8: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -186,7 +190,7 @@ Relu_1x8: | |||||
| fadd v1.4s, v1.4s, v17.4s | fadd v1.4s, v1.4s, v17.4s | ||||
| fmax v0.4s, v0.4s, v27.4s | fmax v0.4s, v0.4s, v27.4s | ||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||||
| b Relu_1x8 | b Relu_1x8 | ||||
| Write_1x8: | Write_1x8: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -195,7 +199,7 @@ Write_1x8: | |||||
| ld1 {v0.4s, v1.4s}, [x1], #32 | ld1 {v0.4s, v1.4s}, [x1], #32 | ||||
| fadd v0.4s, v0.4s, v16.4s | fadd v0.4s, v0.4s, v16.4s | ||||
| fadd v1.4s, v1.4s, v17.4s | fadd v1.4s, v1.4s, v17.4s | ||||
| st1 {v0.4s, v1.4s}, [x25], x6 | |||||
| st1 {v0.4s, v1.4s}, [x15], x6 | |||||
| b Write_1x8 | b Write_1x8 | ||||
| @@ -204,9 +208,9 @@ Loop_C1: | |||||
| beq End | beq End | ||||
| mov w13, w5 | mov w13, w5 | ||||
| ld1 {v16.4s, v17.4s}, [x2], #32 | ld1 {v16.4s, v17.4s}, [x2], #32 | ||||
| mov x25, #4 | |||||
| mul x24, x10, x25 | |||||
| add x0, x0, x24 | |||||
| mov x15, #4 | |||||
| mul x14, x10, x15 | |||||
| add x0, x0, x14 | |||||
| cmp x4, #1 | cmp x4, #1 | ||||
| beq Loop_C1_1 | beq Loop_C1_1 | ||||
| @@ -302,7 +306,7 @@ Loop_C1_2_Write: | |||||
| Loop_C1_3: | Loop_C1_3: | ||||
| add x25, x0, #8 | |||||
| add x15, x0, #8 | |||||
| cmp w7, #2 | cmp w7, #2 | ||||
| beq Loop_C1_3_Relu6 | beq Loop_C1_3_Relu6 | ||||
| cmp w7, #1 | cmp w7, #1 | ||||
| @@ -319,7 +323,7 @@ Loop_C1_3_Relu6: | |||||
| dup s1, v0.s[1] | dup s1, v0.s[1] | ||||
| stp s0, s1, [x0] | stp s0, s1, [x0] | ||||
| add x0, x0, x6 | add x0, x0, x6 | ||||
| st1 {v0.s}[2], [x25], x6 | |||||
| st1 {v0.s}[2], [x15], x6 | |||||
| b Loop_C1_3_Relu6 | b Loop_C1_3_Relu6 | ||||
| Loop_C1_3_Relu: | Loop_C1_3_Relu: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -331,7 +335,7 @@ Loop_C1_3_Relu: | |||||
| dup s1, v0.s[1] | dup s1, v0.s[1] | ||||
| stp s0, s1, [x0] | stp s0, s1, [x0] | ||||
| add x0, x0, x6 | add x0, x0, x6 | ||||
| st1 {v0.s}[2], [x25], x6 | |||||
| st1 {v0.s}[2], [x15], x6 | |||||
| b Loop_C1_3_Relu | b Loop_C1_3_Relu | ||||
| Loop_C1_3_Write: | Loop_C1_3_Write: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -342,7 +346,7 @@ Loop_C1_3_Write: | |||||
| dup s1, v0.s[1] | dup s1, v0.s[1] | ||||
| stp s0, s1, [x0] | stp s0, s1, [x0] | ||||
| add x0, x0, x6 | add x0, x0, x6 | ||||
| st1 {v0.s}[2], [x25], x6 | |||||
| st1 {v0.s}[2], [x15], x6 | |||||
| b Loop_C1_3_Write | b Loop_C1_3_Write | ||||
| Loop_C1_4: | Loop_C1_4: | ||||
| @@ -380,7 +384,7 @@ Loop_C1_4_Write: | |||||
| b Loop_C1_4_Write | b Loop_C1_4_Write | ||||
| Loop_C1_5: | Loop_C1_5: | ||||
| add x25, x0, #16 | |||||
| add x15, x0, #16 | |||||
| cmp w7, #2 | cmp w7, #2 | ||||
| beq Loop_C1_5_Relu6 | beq Loop_C1_5_Relu6 | ||||
| cmp w7, #1 | cmp w7, #1 | ||||
| @@ -398,8 +402,8 @@ Loop_C1_5_Relu6: | |||||
| fmax v0.4s, v0.4s, v27.4s | fmax v0.4s, v0.4s, v27.4s | ||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| str s1, [x25] | |||||
| add x25, x25, x6 | |||||
| str s1, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_5_Relu6 | b Loop_C1_5_Relu6 | ||||
| Loop_C1_5_Relu: | Loop_C1_5_Relu: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -411,8 +415,8 @@ Loop_C1_5_Relu: | |||||
| fmax v0.4s, v0.4s, v27.4s | fmax v0.4s, v0.4s, v27.4s | ||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| str s1, [x25] | |||||
| add x25, x25, x6 | |||||
| str s1, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_5_Relu | b Loop_C1_5_Relu | ||||
| Loop_C1_5_Write: | Loop_C1_5_Write: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -422,12 +426,12 @@ Loop_C1_5_Write: | |||||
| fadd v0.4s, v0.4s, v16.4s | fadd v0.4s, v0.4s, v16.4s | ||||
| fadd v1.4s, v1.4s, v17.4s | fadd v1.4s, v1.4s, v17.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| str s1, [x25] | |||||
| add x25, x25, x6 | |||||
| str s1, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_5_Write | b Loop_C1_5_Write | ||||
| Loop_C1_6: | Loop_C1_6: | ||||
| add x25, x0, #16 | |||||
| add x15, x0, #16 | |||||
| cmp w7, #2 | cmp w7, #2 | ||||
| beq Loop_C1_6_Relu6 | beq Loop_C1_6_Relu6 | ||||
| cmp w7, #1 | cmp w7, #1 | ||||
| @@ -446,8 +450,8 @@ Loop_C1_6_Relu6: | |||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_6_Relu6 | b Loop_C1_6_Relu6 | ||||
| Loop_C1_6_Relu: | Loop_C1_6_Relu: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -460,8 +464,8 @@ Loop_C1_6_Relu: | |||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_6_Relu | b Loop_C1_6_Relu | ||||
| Loop_C1_6_Write: | Loop_C1_6_Write: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -472,13 +476,13 @@ Loop_C1_6_Write: | |||||
| fadd v1.4s, v1.4s, v17.4s | fadd v1.4s, v1.4s, v17.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| b Loop_C1_6_Write | b Loop_C1_6_Write | ||||
| Loop_C1_7: | Loop_C1_7: | ||||
| add x25, x0, #16 | |||||
| add x24, x0, #24 | |||||
| add x15, x0, #16 | |||||
| add x14, x0, #24 | |||||
| cmp w7, #2 | cmp w7, #2 | ||||
| beq Loop_C1_7_Relu6 | beq Loop_C1_7_Relu6 | ||||
| cmp w7, #1 | cmp w7, #1 | ||||
| @@ -497,9 +501,9 @@ Loop_C1_7_Relu6: | |||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| st1 {v1.s}[2], [x24], x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| st1 {v1.s}[2], [x14], x6 | |||||
| b Loop_C1_7_Relu6 | b Loop_C1_7_Relu6 | ||||
| Loop_C1_7_Relu: | Loop_C1_7_Relu: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -512,9 +516,9 @@ Loop_C1_7_Relu: | |||||
| fmax v1.4s, v1.4s, v27.4s | fmax v1.4s, v1.4s, v27.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| st1 {v1.s}[2], [x24], x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| st1 {v1.s}[2], [x14], x6 | |||||
| b Loop_C1_7_Relu | b Loop_C1_7_Relu | ||||
| Loop_C1_7_Write: | Loop_C1_7_Write: | ||||
| cmp w13, #0 | cmp w13, #0 | ||||
| @@ -525,11 +529,14 @@ Loop_C1_7_Write: | |||||
| fadd v1.4s, v1.4s, v17.4s | fadd v1.4s, v1.4s, v17.4s | ||||
| st1 {v0.4s}, [x0], x6 | st1 {v0.4s}, [x0], x6 | ||||
| dup s0, v1.s[1] | dup s0, v1.s[1] | ||||
| stp s1, s0, [x25] | |||||
| add x25, x25, x6 | |||||
| st1 {v1.s}[2], [x24], x6 | |||||
| stp s1, s0, [x15] | |||||
| add x15, x15, x6 | |||||
| st1 {v1.s}[2], [x14], x6 | |||||
| b Loop_C1_7_Write | b Loop_C1_7_Write | ||||
| End: | End: | ||||
| sub sp, sp, #128 | |||||
| ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 | |||||
| ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 | |||||
| ret | ret | ||||
| #endif | #endif | ||||