| @@ -1,131 +0,0 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global C4BiasAdd | |||||
| #ifndef __APPLE__ | |||||
| .type C4BiasAdd, %function | |||||
| #endif | |||||
| // C4BiasAdd: reads the input as consecutive groups of four floats, adds a four-float bias vector to | |||||
| // every group and stores the result to dst, one group per plane position, moving the write cursor by | |||||
| // `stride` after each position. Channels are handled four at a time; when fewer than four channels | |||||
| // remain, only 3, 2 or 1 floats are stored per position. | |||||
| // `stride` is added directly to the dst pointer, so it is a distance in bytes. | |||||
| // NOTE(review): plane_size == 0 is not guarded; Loop1 would still execute and its subs/bne counter | |||||
| // would wrap past zero. | |||||
| //void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||||
| C4BiasAdd: | |||||
| LoopOc: | |||||
| // v4 = bias for the current group of four channels; x2 moves on to the next group. | |||||
| ld1 {v4.4s}, [x2], #16 | |||||
| // x6 = plane positions left for this channel group, x7 = dst write cursor for this group. | |||||
| mov x6, x4 | |||||
| mov x7, x0 | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| // Loop4: four plane positions per iteration (v0..v3), 64 input bytes consumed. | |||||
| Loop4: | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| fadd v1.4s, v1.4s, v4.4s | |||||
| fadd v2.4s, v2.4s, v4.4s | |||||
| fadd v3.4s, v3.4s, v4.4s | |||||
| // x3 is the number of channels still to be written: >= 4 stores all four lanes, 3 or 2 store that | |||||
| // many lanes, any other value falls through to the single-lane store. | |||||
| cmp x3, #4 | |||||
| bge Write4x4 | |||||
| cmp x3, #3 | |||||
| beq Write3x4 | |||||
| cmp x3, #2 | |||||
| beq Write2x4 | |||||
| Write1x4: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| str s1, [x7] | |||||
| add x7, x7, x5 | |||||
| str s2, [x7] | |||||
| add x7, x7, x5 | |||||
| str s3, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Two lanes per position: lane 1 is copied to a scalar register so stp can store lanes 0 and 1 together. | |||||
| Write2x4: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Three lanes per position: x8 = x7 + 8 addresses the third float; its post-index by x5 keeps it in | |||||
| // step with x7. | |||||
| Write3x4: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v1.s}[2], [x8], x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v2.s}[2], [x8], x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v3.s}[2], [x8], x5 | |||||
| b WriteEndx4 | |||||
| Write4x4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| st1 {v1.4s}, [x7], x5 | |||||
| st1 {v2.4s}, [x7], x5 | |||||
| st1 {v3.4s}, [x7], x5 | |||||
| // Four positions done: finish the group if none remain, otherwise continue with Loop4 or the | |||||
| // one-at-a-time tail in Loop1. | |||||
| WriteEndx4: | |||||
| subs x6, x6, #4 | |||||
| beq LoopOcEnd | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| b Loop4 | |||||
| // Loop1: same work as Loop4 for a single plane position (16 input bytes). | |||||
| Loop1: | |||||
| ld1 {v0.4s}, [x1], #16 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| cmp x3, #4 | |||||
| bge Write4 | |||||
| cmp x3, #3 | |||||
| beq Write3 | |||||
| cmp x3, #2 | |||||
| beq Write2 | |||||
| Write1: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write2: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write3: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| b WriteEnd | |||||
| Write4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| WriteEnd: | |||||
| subs x6, x6, #1 | |||||
| bne Loop1 | |||||
| // Next channel group: four fewer channels remain and dst moves 16 bytes (four floats) forward. | |||||
| // `add` leaves the flags untouched, so `bgt` still tests the result of `subs`. | |||||
| LoopOcEnd: | |||||
| subs x3, x3, #4 | |||||
| add x0, x0, #16 | |||||
| bgt LoopOc | |||||
| ret | |||||
| #endif | |||||
| @@ -1,137 +0,0 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global C4BiasAddRelu | |||||
| #ifndef __APPLE__ | |||||
| .type C4BiasAddRelu, %function | |||||
| #endif | |||||
| // C4BiasAddRelu: reads the input as consecutive groups of four floats, adds a four-float bias vector, | |||||
| // takes the maximum with zero and stores the result to dst, one group per plane position, moving the | |||||
| // write cursor by `stride` after each position. Channels are handled four at a time; when fewer than | |||||
| // four channels remain, only 3, 2 or 1 floats are stored per position. | |||||
| // `stride` is added directly to the dst pointer, so it is a distance in bytes. | |||||
| // NOTE(review): plane_size == 0 is not guarded; Loop1 would still execute and its subs/bne counter | |||||
| // would wrap past zero. | |||||
| //void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||||
| C4BiasAddRelu: | |||||
| // v5 = four zero lanes (wzr is the zero register); used as the lower clamp. | |||||
| dup v5.4s, wzr | |||||
| LoopOc: | |||||
| // v4 = bias for the current group of four channels; x2 moves on to the next group. | |||||
| ld1 {v4.4s}, [x2], #16 | |||||
| // x6 = plane positions left for this channel group, x7 = dst write cursor for this group. | |||||
| mov x6, x4 | |||||
| mov x7, x0 | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| // Loop4: four plane positions per iteration (v0..v3), 64 input bytes consumed. | |||||
| Loop4: | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| fadd v1.4s, v1.4s, v4.4s | |||||
| fadd v2.4s, v2.4s, v4.4s | |||||
| fadd v3.4s, v3.4s, v4.4s | |||||
| // max(value, 0) on every lane. | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmax v1.4s, v1.4s, v5.4s | |||||
| fmax v2.4s, v2.4s, v5.4s | |||||
| fmax v3.4s, v3.4s, v5.4s | |||||
| // x3 is the number of channels still to be written: >= 4 stores all four lanes, 3 or 2 store that | |||||
| // many lanes, any other value falls through to the single-lane store. | |||||
| cmp x3, #4 | |||||
| bge Write4x4 | |||||
| cmp x3, #3 | |||||
| beq Write3x4 | |||||
| cmp x3, #2 | |||||
| beq Write2x4 | |||||
| Write1x4: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| str s1, [x7] | |||||
| add x7, x7, x5 | |||||
| str s2, [x7] | |||||
| add x7, x7, x5 | |||||
| str s3, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Two lanes per position: lane 1 is copied to a scalar register so stp can store lanes 0 and 1 together. | |||||
| Write2x4: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Three lanes per position: x8 = x7 + 8 addresses the third float; its post-index by x5 keeps it in | |||||
| // step with x7. | |||||
| Write3x4: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v1.s}[2], [x8], x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v2.s}[2], [x8], x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v3.s}[2], [x8], x5 | |||||
| b WriteEndx4 | |||||
| Write4x4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| st1 {v1.4s}, [x7], x5 | |||||
| st1 {v2.4s}, [x7], x5 | |||||
| st1 {v3.4s}, [x7], x5 | |||||
| // Four positions done: finish the group if none remain, otherwise continue with Loop4 or the | |||||
| // one-at-a-time tail in Loop1. | |||||
| WriteEndx4: | |||||
| subs x6, x6, #4 | |||||
| beq LoopOcEnd | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| b Loop4 | |||||
| // Loop1: same work as Loop4 for a single plane position (16 input bytes). | |||||
| Loop1: | |||||
| ld1 {v0.4s}, [x1], #16 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| cmp x3, #4 | |||||
| bge Write4 | |||||
| cmp x3, #3 | |||||
| beq Write3 | |||||
| cmp x3, #2 | |||||
| beq Write2 | |||||
| Write1: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write2: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write3: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| b WriteEnd | |||||
| Write4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| WriteEnd: | |||||
| subs x6, x6, #1 | |||||
| bne Loop1 | |||||
| // Next channel group: four fewer channels remain and dst moves 16 bytes (four floats) forward. | |||||
| // `add` leaves the flags untouched, so `bgt` still tests the result of `subs`. | |||||
| LoopOcEnd: | |||||
| subs x3, x3, #4 | |||||
| add x0, x0, #16 | |||||
| bgt LoopOc | |||||
| ret | |||||
| #endif | |||||
| @@ -1,146 +0,0 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global C4BiasAddRelu6 | |||||
| #ifndef __APPLE__ | |||||
| .type C4BiasAddRelu6, %function | |||||
| #endif | |||||
| // C4BiasAddRelu6: reads the input as consecutive groups of four floats, adds a four-float bias vector, | |||||
| // clamps every value to the range [0, 6] and stores the result to dst, one group per plane position, | |||||
| // moving the write cursor by `stride` after each position. Channels are handled four at a time; when | |||||
| // fewer than four channels remain, only 3, 2 or 1 floats are stored per position. | |||||
| // `stride` is added directly to the dst pointer, so it is a distance in bytes. | |||||
| // NOTE(review): plane_size == 0 is not guarded; Loop1 would still execute and its subs/bne counter | |||||
| // would wrap past zero. | |||||
| //void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||||
| C4BiasAddRelu6: | |||||
| // v5 = four zero lanes (lower clamp). v6 = integer 6 in every lane, then converted to float 6.0 | |||||
| // (upper clamp). | |||||
| dup v5.4s, wzr | |||||
| movi v6.4s, #6 | |||||
| scvtf v6.4s, v6.4s | |||||
| LoopOc: | |||||
| // v4 = bias for the current group of four channels; x2 moves on to the next group. | |||||
| ld1 {v4.4s}, [x2], #16 | |||||
| // x6 = plane positions left for this channel group, x7 = dst write cursor for this group. | |||||
| mov x6, x4 | |||||
| mov x7, x0 | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| // Loop4: four plane positions per iteration (v0..v3), 64 input bytes consumed. | |||||
| Loop4: | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| fadd v1.4s, v1.4s, v4.4s | |||||
| fadd v2.4s, v2.4s, v4.4s | |||||
| fadd v3.4s, v3.4s, v4.4s | |||||
| // max(value, 0) followed by min(value, 6) on every lane. | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmax v1.4s, v1.4s, v5.4s | |||||
| fmax v2.4s, v2.4s, v5.4s | |||||
| fmax v3.4s, v3.4s, v5.4s | |||||
| fmin v0.4s, v0.4s, v6.4s | |||||
| fmin v1.4s, v1.4s, v6.4s | |||||
| fmin v2.4s, v2.4s, v6.4s | |||||
| fmin v3.4s, v3.4s, v6.4s | |||||
| // x3 is the number of channels still to be written: >= 4 stores all four lanes, 3 or 2 store that | |||||
| // many lanes, any other value falls through to the single-lane store. | |||||
| cmp x3, #4 | |||||
| bge Write4x4 | |||||
| cmp x3, #3 | |||||
| beq Write3x4 | |||||
| cmp x3, #2 | |||||
| beq Write2x4 | |||||
| Write1x4: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| str s1, [x7] | |||||
| add x7, x7, x5 | |||||
| str s2, [x7] | |||||
| add x7, x7, x5 | |||||
| str s3, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Two lanes per position: lane 1 is copied to a scalar register so stp can store lanes 0 and 1 together. | |||||
| Write2x4: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEndx4 | |||||
| // Three lanes per position: x8 = x7 + 8 addresses the third float; its post-index by x5 keeps it in | |||||
| // step with x7. | |||||
| Write3x4: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v1.s}[2], [x8], x5 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v2.s}[2], [x8], x5 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v3.s}[2], [x8], x5 | |||||
| b WriteEndx4 | |||||
| Write4x4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| st1 {v1.4s}, [x7], x5 | |||||
| st1 {v2.4s}, [x7], x5 | |||||
| st1 {v3.4s}, [x7], x5 | |||||
| // Four positions done: finish the group if none remain, otherwise continue with Loop4 or the | |||||
| // one-at-a-time tail in Loop1. | |||||
| WriteEndx4: | |||||
| subs x6, x6, #4 | |||||
| beq LoopOcEnd | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| b Loop4 | |||||
| // Loop1: same work as Loop4 for a single plane position (16 input bytes). | |||||
| Loop1: | |||||
| ld1 {v0.4s}, [x1], #16 | |||||
| fadd v0.4s, v0.4s, v4.4s | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmin v0.4s, v0.4s, v6.4s | |||||
| cmp x3, #4 | |||||
| bge Write4 | |||||
| cmp x3, #3 | |||||
| beq Write3 | |||||
| cmp x3, #2 | |||||
| beq Write2 | |||||
| Write1: | |||||
| str s0, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write2: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| b WriteEnd | |||||
| Write3: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x5 | |||||
| st1 {v0.s}[2], [x8], x5 | |||||
| b WriteEnd | |||||
| Write4: | |||||
| st1 {v0.4s}, [x7], x5 | |||||
| WriteEnd: | |||||
| subs x6, x6, #1 | |||||
| bne Loop1 | |||||
| // Next channel group: four fewer channels remain and dst moves 16 bytes (four floats) forward. | |||||
| // `add` leaves the flags untouched, so `bgt` still tests the result of `subs`. | |||||
| LoopOcEnd: | |||||
| subs x3, x3, #4 | |||||
| add x0, x0, #16 | |||||
| bgt LoopOc | |||||
| ret | |||||
| #endif | |||||
| @@ -1,132 +0,0 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global C4Relu | |||||
| #ifndef __APPLE__ | |||||
| .type C4Relu, %function | |||||
| #endif | |||||
| // C4Relu: reads the input as consecutive groups of four floats, takes the maximum with zero and stores | |||||
| // the result to dst, one group per plane position, moving the write cursor by `stride` after each | |||||
| // position. Channels are handled four at a time; when fewer than four channels remain, only 3, 2 or 1 | |||||
| // floats are stored per position. | |||||
| // `stride` is added directly to the dst pointer, so it is a distance in bytes. | |||||
| // This variant takes no bias: v4 is never loaded and must not be used as an operand. | |||||
| // NOTE(review): plane_size == 0 is not guarded; Loop1 would still execute and its subs/bne counter | |||||
| // would wrap past zero. | |||||
| //void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride) | |||||
| //x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride | |||||
| C4Relu: | |||||
| // v5 = four zero lanes (wzr is the zero register); used as the lower clamp. | |||||
| dup v5.4s, wzr | |||||
| LoopOc: | |||||
| // x6 = plane positions left for this channel group, x7 = dst write cursor for this group. | |||||
| mov x6, x3 | |||||
| mov x7, x0 | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| // Loop4: four plane positions per iteration (v0..v3), 64 input bytes consumed. | |||||
| Loop4: | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmax v1.4s, v1.4s, v5.4s | |||||
| fmax v2.4s, v2.4s, v5.4s | |||||
| fmax v3.4s, v3.4s, v5.4s | |||||
| // x2 is the number of channels still to be written: >= 4 stores all four lanes, 3 or 2 store that | |||||
| // many lanes, any other value falls through to the single-lane store. | |||||
| cmp x2, #4 | |||||
| bge Write4x4 | |||||
| cmp x2, #3 | |||||
| beq Write3x4 | |||||
| cmp x2, #2 | |||||
| beq Write2x4 | |||||
| Write1x4: | |||||
| str s0, [x7] | |||||
| add x7, x7, x4 | |||||
| str s1, [x7] | |||||
| add x7, x7, x4 | |||||
| str s2, [x7] | |||||
| add x7, x7, x4 | |||||
| str s3, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEndx4 | |||||
| // Two lanes per position: lane 1 is copied to a scalar register so stp can store lanes 0 and 1 together. | |||||
| Write2x4: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEndx4 | |||||
| // Three lanes per position: x8 = x7 + 8 addresses the third float; its post-index by x4 keeps it in | |||||
| // step with x7. | |||||
| Write3x4: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v0.s}[2], [x8], x4 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v1.s}[2], [x8], x4 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v2.s}[2], [x8], x4 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v3.s}[2], [x8], x4 | |||||
| b WriteEndx4 | |||||
| Write4x4: | |||||
| st1 {v0.4s}, [x7], x4 | |||||
| st1 {v1.4s}, [x7], x4 | |||||
| st1 {v2.4s}, [x7], x4 | |||||
| st1 {v3.4s}, [x7], x4 | |||||
| // Four positions done: finish the group if none remain, otherwise continue with Loop4 or the | |||||
| // one-at-a-time tail in Loop1. | |||||
| WriteEndx4: | |||||
| subs x6, x6, #4 | |||||
| beq LoopOcEnd | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| b Loop4 | |||||
| // Loop1: same work as Loop4 for a single plane position (16 input bytes). Only the clamp is applied, | |||||
| // exactly as in Loop4; there is no bias to add. | |||||
| Loop1: | |||||
| ld1 {v0.4s}, [x1], #16 | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| cmp x2, #4 | |||||
| bge Write4 | |||||
| cmp x2, #3 | |||||
| beq Write3 | |||||
| cmp x2, #2 | |||||
| beq Write2 | |||||
| Write1: | |||||
| str s0, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEnd | |||||
| Write2: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEnd | |||||
| Write3: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v0.s}[2], [x8], x4 | |||||
| b WriteEnd | |||||
| Write4: | |||||
| st1 {v0.4s}, [x7], x4 | |||||
| WriteEnd: | |||||
| subs x6, x6, #1 | |||||
| bne Loop1 | |||||
| // Next channel group: four fewer channels remain and dst moves 16 bytes (four floats) forward. | |||||
| // `add` leaves the flags untouched, so `bgt` still tests the result of `subs`. | |||||
| LoopOcEnd: | |||||
| subs x2, x2, #4 | |||||
| add x0, x0, #16 | |||||
| bgt LoopOc | |||||
| ret | |||||
| #endif | |||||
| @@ -1,140 +0,0 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| //.p2align 5,,15 | |||||
| .global C4Relu6 | |||||
| #ifndef __APPLE__ | |||||
| .type C4Relu6, %function | |||||
| #endif | |||||
| // C4Relu6: reads the input as consecutive groups of four floats, clamps every value to the range | |||||
| // [0, 6] and stores the result to dst, one group per plane position, moving the write cursor by | |||||
| // `stride` after each position. Channels are handled four at a time; when fewer than four channels | |||||
| // remain, only 3, 2 or 1 floats are stored per position. | |||||
| // `stride` is added directly to the dst pointer, so it is a distance in bytes. | |||||
| // This variant takes no bias: v4 is never loaded and must not be used as an operand. | |||||
| // NOTE(review): plane_size == 0 is not guarded; Loop1 would still execute and its subs/bne counter | |||||
| // would wrap past zero. | |||||
| //void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride) | |||||
| //x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride | |||||
| C4Relu6: | |||||
| // v5 = four zero lanes (lower clamp). v6 = integer 6 in every lane, then converted to float 6.0 | |||||
| // (upper clamp). | |||||
| dup v5.4s, wzr | |||||
| movi v6.4s, #6 | |||||
| scvtf v6.4s, v6.4s | |||||
| LoopOc: | |||||
| // x6 = plane positions left for this channel group, x7 = dst write cursor for this group. | |||||
| mov x6, x3 | |||||
| mov x7, x0 | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| // Loop4: four plane positions per iteration (v0..v3), 64 input bytes consumed. | |||||
| Loop4: | |||||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||||
| // max(value, 0) followed by min(value, 6) on every lane. | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmax v1.4s, v1.4s, v5.4s | |||||
| fmax v2.4s, v2.4s, v5.4s | |||||
| fmax v3.4s, v3.4s, v5.4s | |||||
| fmin v0.4s, v0.4s, v6.4s | |||||
| fmin v1.4s, v1.4s, v6.4s | |||||
| fmin v2.4s, v2.4s, v6.4s | |||||
| fmin v3.4s, v3.4s, v6.4s | |||||
| // x2 is the number of channels still to be written: >= 4 stores all four lanes, 3 or 2 store that | |||||
| // many lanes, any other value falls through to the single-lane store. | |||||
| cmp x2, #4 | |||||
| bge Write4x4 | |||||
| cmp x2, #3 | |||||
| beq Write3x4 | |||||
| cmp x2, #2 | |||||
| beq Write2x4 | |||||
| Write1x4: | |||||
| str s0, [x7] | |||||
| add x7, x7, x4 | |||||
| str s1, [x7] | |||||
| add x7, x7, x4 | |||||
| str s2, [x7] | |||||
| add x7, x7, x4 | |||||
| str s3, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEndx4 | |||||
| // Two lanes per position: lane 1 is copied to a scalar register so stp can store lanes 0 and 1 together. | |||||
| Write2x4: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x4 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEndx4 | |||||
| // Three lanes per position: x8 = x7 + 8 addresses the third float; its post-index by x4 keeps it in | |||||
| // step with x7. | |||||
| Write3x4: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v0.s}[2], [x8], x4 | |||||
| dup s17, v1.s[1] | |||||
| stp s1, s17, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v1.s}[2], [x8], x4 | |||||
| dup s18, v2.s[1] | |||||
| stp s2, s18, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v2.s}[2], [x8], x4 | |||||
| dup s19, v3.s[1] | |||||
| stp s3, s19, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v3.s}[2], [x8], x4 | |||||
| b WriteEndx4 | |||||
| Write4x4: | |||||
| st1 {v0.4s}, [x7], x4 | |||||
| st1 {v1.4s}, [x7], x4 | |||||
| st1 {v2.4s}, [x7], x4 | |||||
| st1 {v3.4s}, [x7], x4 | |||||
| // Four positions done: finish the group if none remain, otherwise continue with Loop4 or the | |||||
| // one-at-a-time tail in Loop1. | |||||
| WriteEndx4: | |||||
| subs x6, x6, #4 | |||||
| beq LoopOcEnd | |||||
| cmp x6, #4 | |||||
| blt Loop1 | |||||
| b Loop4 | |||||
| // Loop1: same work as Loop4 for a single plane position (16 input bytes). Only the clamp is applied, | |||||
| // exactly as in Loop4; there is no bias to add. | |||||
| Loop1: | |||||
| ld1 {v0.4s}, [x1], #16 | |||||
| fmax v0.4s, v0.4s, v5.4s | |||||
| fmin v0.4s, v0.4s, v6.4s | |||||
| cmp x2, #4 | |||||
| bge Write4 | |||||
| cmp x2, #3 | |||||
| beq Write3 | |||||
| cmp x2, #2 | |||||
| beq Write2 | |||||
| Write1: | |||||
| str s0, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEnd | |||||
| Write2: | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| b WriteEnd | |||||
| Write3: | |||||
| add x8, x7, #8 | |||||
| dup s16, v0.s[1] | |||||
| stp s0, s16, [x7] | |||||
| add x7, x7, x4 | |||||
| st1 {v0.s}[2], [x8], x4 | |||||
| b WriteEnd | |||||
| Write4: | |||||
| st1 {v0.4s}, [x7], x4 | |||||
| WriteEnd: | |||||
| subs x6, x6, #1 | |||||
| bne Loop1 | |||||
| // Next channel group: four fewer channels remain and dst moves 16 bytes (four floats) forward. | |||||
| // `add` leaves the flags untouched, so `bgt` still tests the result of `subs`. | |||||
| LoopOcEnd: | |||||
| subs x2, x2, #4 | |||||
| add x0, x0, #16 | |||||
| bgt LoopOc | |||||
| ret | |||||
| #endif | |||||
| @@ -40,32 +40,6 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p | |||||
| return; | return; | ||||
| } | } | ||||
| // Post-processes a convolution result stored in four-channel groups: optionally adds the bias, applies | |||||
| // ReLU or ReLU6, and writes the values to out_ptr with `stride` floats between consecutive plane | |||||
| // positions. | |||||
| // Without ENABLE_ARM64 all work is delegated to PostConvFuncComm. With ENABLE_ARM64 the assembly | |||||
| // kernels are used; they take the stride in bytes, hence the sizeof(float) scaling. | |||||
| // When there is neither a bias nor an activation, no assembly kernel applies, so the generic routine | |||||
| // is used to make sure out_ptr is still written, matching the non-ARM64 path. | |||||
| void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | |||||
| #ifndef ENABLE_ARM64 | |||||
| PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C4NUM); | |||||
| #else | |||||
| const size_t stride_bytes = stride * sizeof(float); | |||||
| if (bias_ptr != NULL) { | |||||
| if (is_relu) { | |||||
| C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes); | |||||
| } else if (is_relu6) { | |||||
| C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes); | |||||
| } else { | |||||
| C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride_bytes); | |||||
| } | |||||
| } else { | |||||
| if (is_relu) { | |||||
| C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride_bytes); | |||||
| } else if (is_relu6) { | |||||
| C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride_bytes); | |||||
| } else { | |||||
| // No bias and no activation: plain copy into the output layout (stride in floats here). | |||||
| PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C4NUM); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| return; | |||||
| } | |||||
| void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | ||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | ||||
| #ifndef ENABLE_ARM64 | #ifndef ENABLE_ARM64 | ||||
| @@ -27,8 +27,6 @@ | |||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6); | |||||
| void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | ||||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6); | size_t plane_size, size_t stride, bool is_relu, bool is_relu6); | ||||
| float ShortToFloat32(uint16_t src_value); | float ShortToFloat32(uint16_t src_value); | ||||
| @@ -50,11 +48,6 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); | |||||
| void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | ||||
| void Relu6(float *data, size_t element4); | void Relu6(float *data, size_t element4); | ||||
| void Relu(float *data, size_t element4); | void Relu(float *data, size_t element4); | ||||
| void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||||
| void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||||
| void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||||
| void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||||
| void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | ||||
| size_t output_channel, size_t input_step); | size_t output_channel, size_t input_step); | ||||
| @@ -45,8 +45,13 @@ int ConvolutionCPUKernel::InitWeightBias() { | |||||
| int ic4 = UP_DIV(in_channel, C4NUM); | int ic4 = UP_DIV(in_channel, C4NUM); | ||||
| int kernel_plane = kernel_h * kernel_w; | int kernel_plane = kernel_h * kernel_w; | ||||
| int oc_block, oc_block_num; | int oc_block, oc_block_num; | ||||
| #ifdef ENABLE_ARM32 | |||||
| oc_block = C4NUM; | |||||
| oc_block_num = UP_DIV(out_channel, C4NUM); | |||||
| #else | |||||
| oc_block = C8NUM; | oc_block = C8NUM; | ||||
| oc_block_num = UP_DIV(out_channel, C8NUM); | oc_block_num = UP_DIV(out_channel, C8NUM); | ||||
| #endif | |||||
| int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; | int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; | ||||
| auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData()); | auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData()); | ||||
| @@ -113,11 +118,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() { | |||||
| auto output_tensor = out_tensors_.at(kOutputIndex); | auto output_tensor = out_tensors_.at(kOutputIndex); | ||||
| output_tensor->SetFormat(schema::Format::Format_NHWC); | output_tensor->SetFormat(schema::Format::Format_NHWC); | ||||
| // #ifdef ENABLE_ARM32 | |||||
| // gemm_func_ = IndirectGemmFp32_8x4; | |||||
| // #else | |||||
| #ifdef ENABLE_ARM32 | |||||
| gemm_func_ = IndirectGemmFp32_8x4; | |||||
| #else | |||||
| gemm_func_ = IndirectGemmFp32_8x8; | gemm_func_ = IndirectGemmFp32_8x8; | ||||
| // #endif | |||||
| #endif | |||||
| } | } | ||||
| int ConvolutionCPUKernel::Init() { | int ConvolutionCPUKernel::Init() { | ||||
| @@ -170,79 +170,6 @@ TEST_F(TestConv1x1Fp32, Conv1x1WeightTest1) { | |||||
| delete conv_param; | delete conv_param; | ||||
| } | } | ||||
| // Runs PostConvFuncFp32C4 on one fixed input with four option combinations (bias only, bias + relu, | |||||
| // bias + relu6, relu without bias) and compares each result with precomputed values. | |||||
| TEST_F(TestConv1x1Fp32, PostConvFuncC4Test1) { | |||||
| const int channel = 5; | |||||
| const int plane = 8; | |||||
| const int row_stride = 5; | |||||
| const int total = 40; | |||||
| const double tolerance = 0.0001; | |||||
| // 64 input values; in the second half every group of four holds one value followed by three zeros. | |||||
| float packed_in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, | |||||
| -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, | |||||
| -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, | |||||
| 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, | |||||
| -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, | |||||
| 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, | |||||
| -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, | |||||
| -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; | |||||
| float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; | |||||
| // The same buffer receives the result of every call below; it is not cleared in between. | |||||
| float result[total] = {0}; | |||||
| // Bias only. | |||||
| float expect_plain[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, | |||||
| 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, | |||||
| -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, | |||||
| -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, | |||||
| -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; | |||||
| PostConvFuncFp32C4(packed_in, result, bias, channel, plane, row_stride, false, false); | |||||
| CompareOutputData(result, expect_plain, total, tolerance); | |||||
| // Bias followed by relu. | |||||
| float expect_relu[] = {0, 0, 8.56133, 0, 0, 0, 1.2270198, 17.954533, 11.086085, 0, | |||||
| 0, 0, 11.90631, 0.3088621, 11.196218, 0, 0, 0, 0, 0, | |||||
| 0, 0, 0, 9.464027, 0, 14.387108, 8.693133, 8.080041, 0, 0, | |||||
| 2.8319538, 7.177942, 0, 12.194644, 0, 0, 0, 0, 0, 0}; | |||||
| PostConvFuncFp32C4(packed_in, result, bias, channel, plane, row_stride, true, false); | |||||
| CompareOutputData(result, expect_relu, total, tolerance); | |||||
| // Bias followed by relu6. | |||||
| float expect_relu6[] = {0, 0, 6, 0, 0, 0, 1.2270198, 6, 6, 0, 0, 0, 6, 0.3088621, 6, 0, 0, 0, 0, 0, | |||||
| 0, 0, 0, 6, 0, 6, 6, 6, 0, 0, 2.8319538, 6, 0, 6, 0, 0, 0, 0, 0, 0}; | |||||
| PostConvFuncFp32C4(packed_in, result, bias, channel, plane, row_stride, false, true); | |||||
| CompareOutputData(result, expect_relu6, total, tolerance); | |||||
| // Relu with no bias pointer. | |||||
| float expect_relu_no_bias[] = {0, 0, 7.5724425, 0, 0, 0, 0.7406984, 16.965645, | |||||
| 10.888806, 0, 0, 0, 10.917422, 0.11158327, 11.1863365, 0, | |||||
| 0, 0, 0, 0, 0, 0, 0, 9.266748, | |||||
| 0, 13.644127, 8.206812, 7.091153, 0, 0, 2.0889723, 6.6916203, | |||||
| 0, 11.997365, 0, 0, 0, 0, 0, 0}; | |||||
| PostConvFuncFp32C4(packed_in, result, nullptr, channel, plane, row_stride, true, false); | |||||
| CompareOutputData(result, expect_relu_no_bias, total, tolerance); | |||||
| } | |||||
| // Splits the five output channels across two simulated threads, each handling one group of up to four | |||||
| // channels, and checks that the combined output matches the precomputed bias-add result. | |||||
| TEST_F(TestConv1x1Fp32, PostConvFuncC4Test2) { | |||||
| float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, | |||||
| -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, | |||||
| -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, | |||||
| 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, | |||||
| -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, | |||||
| 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, | |||||
| -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, | |||||
| -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; | |||||
| float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; | |||||
| float corr[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, | |||||
| 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, | |||||
| -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, | |||||
| -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, | |||||
| -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; | |||||
| float out[40] = {0}; | |||||
| int thread_count_ = 2; | |||||
| int thread_oc4_stride_ = 1; | |||||
| int output_channel = 5; | |||||
| int plane_size = 8; | |||||
| for (int i = 0; i < thread_count_; i++) { | |||||
| // Channels left for this thread; the last thread may get fewer than a full group of four. | |||||
| int cur_oc = MSMIN(thread_oc4_stride_ * 4, output_channel - i * thread_oc4_stride_ * 4); | |||||
| if (cur_oc <= 0) break; | |||||
| // Input, output and bias all advance linearly with the thread index. The output offset used to be | |||||
| // i * i * ..., which only matched for i = 0 and i = 1. | |||||
| PostConvFuncFp32C4(in + thread_oc4_stride_ * i * 8 * 4, out + i * thread_oc4_stride_ * 4, | |||||
| bias + i * thread_oc4_stride_ * 4, cur_oc, plane_size, output_channel, false, false); | |||||
| } | |||||
| CompareOutputData(out, corr, 40, 0.0001); | |||||
| } | |||||
| int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_, | int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_, | ||||
| ConvParameter *conv_param, float **correct) { | ConvParameter *conv_param, float **correct) { | ||||
| lite::Tensor *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, | lite::Tensor *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, | ||||