| @@ -1,131 +0,0 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAdd
#ifndef __APPLE__
    .type C4BiasAdd, %function
#endif

// void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to channel-blocked input and scatters the result to dst.
//   input : consumed sequentially, 4 floats per pixel, plane_size pixels for each block of 4 channels.
//   bias  : 4 floats are loaded for every channel block, including a partial last block.
//   dst   : consecutive pixels are `stride` BYTES apart (x5 is added to a byte address);
//           each channel block starts 16 bytes after the previous one.
//   oc    : channels still to be written; when fewer than 4 remain only that many lanes are stored.
// Returns immediately when oc or plane_size is zero.
// Scratch: x6 (pixels left in block), x7/x8 (store cursors), v0-v4, v16-v19.
C4BiasAdd:
    // Nothing to do for an empty tensor; without this the loops below would still
    // run once for oc == 0 and wrap the pixel counter for plane_size == 0.
    cbz x3, End
    cbz x4, End

LoopOc:
    ld1 {v4.4s}, [x2], #16          // bias for this block of 4 channels
    mov x6, x4                      // pixels left in this block
    mov x7, x0                      // store cursor for this block
    cmp x6, #4
    blt Loop1

Loop4:                              // four pixels per iteration
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    // Pick the store width from the number of channels still outstanding.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:                           // one channel per pixel
    str s0, [x7]
    add x7, x7, x5
    str s1, [x7]
    add x7, x7, x5
    str s2, [x7]
    add x7, x7, x5
    str s3, [x7]
    add x7, x7, x5
    b WriteEndx4

Write2x4:                           // two channels per pixel
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    b WriteEndx4

Write3x4:                           // three channels per pixel: a pair via x7, lane 2 via x8
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    st1 {v1.s}[2], [x8], x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    st1 {v2.s}[2], [x8], x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:                           // full block of four channels per pixel
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    blt Loop1
    b Loop4

Loop1:                              // remaining pixels, one per iteration
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    str s0, [x7]
    add x7, x7, x5
    b WriteEnd

Write2:
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    b WriteEnd

Write3:
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                 // sets the flags tested by bgt below
    add x0, x0, #16                 // next channel block in dst (add leaves the flags alone)
    bgt LoopOc

End:
    ret
#endif
| @@ -1,137 +0,0 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAddRelu
#ifndef __APPLE__
    .type C4BiasAddRelu, %function
#endif

// void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to channel-blocked input, clamps the result at zero
// from below (ReLU) and scatters it to dst.
//   input : consumed sequentially, 4 floats per pixel, plane_size pixels for each block of 4 channels.
//   bias  : 4 floats are loaded for every channel block, including a partial last block.
//   dst   : consecutive pixels are `stride` BYTES apart (x5 is added to a byte address);
//           each channel block starts 16 bytes after the previous one.
//   oc    : channels still to be written; when fewer than 4 remain only that many lanes are stored.
// Returns immediately when oc or plane_size is zero.
// Scratch: x6 (pixels left in block), x7/x8 (store cursors), v0-v5, v16-v19.
C4BiasAddRelu:
    // Nothing to do for an empty tensor; without this the loops below would still
    // run once for oc == 0 and wrap the pixel counter for plane_size == 0.
    cbz x3, End
    cbz x4, End

    dup v5.4s, wzr                  // v5 = {0, 0, 0, 0}, the ReLU lower bound

LoopOc:
    ld1 {v4.4s}, [x2], #16          // bias for this block of 4 channels
    mov x6, x4                      // pixels left in this block
    mov x7, x0                      // store cursor for this block
    cmp x6, #4
    blt Loop1

Loop4:                              // four pixels per iteration
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    // Pick the store width from the number of channels still outstanding.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:                           // one channel per pixel
    str s0, [x7]
    add x7, x7, x5
    str s1, [x7]
    add x7, x7, x5
    str s2, [x7]
    add x7, x7, x5
    str s3, [x7]
    add x7, x7, x5
    b WriteEndx4

Write2x4:                           // two channels per pixel
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    b WriteEndx4

Write3x4:                           // three channels per pixel: a pair via x7, lane 2 via x8
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    st1 {v1.s}[2], [x8], x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    st1 {v2.s}[2], [x8], x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:                           // full block of four channels per pixel
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    blt Loop1
    b Loop4

Loop1:                              // remaining pixels, one per iteration
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s
    fmax v0.4s, v0.4s, v5.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    str s0, [x7]
    add x7, x7, x5
    b WriteEnd

Write2:
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    b WriteEnd

Write3:
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                 // sets the flags tested by bgt below
    add x0, x0, #16                 // next channel block in dst (add leaves the flags alone)
    bgt LoopOc

End:
    ret
#endif
| @@ -1,146 +0,0 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global C4BiasAddRelu6
#ifndef __APPLE__
    .type C4BiasAddRelu6, %function
#endif

// void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride
//
// Adds a per-channel bias to channel-blocked input, clamps the result to the
// range [0, 6] (ReLU6) and scatters it to dst.
//   input : consumed sequentially, 4 floats per pixel, plane_size pixels for each block of 4 channels.
//   bias  : 4 floats are loaded for every channel block, including a partial last block.
//   dst   : consecutive pixels are `stride` BYTES apart (x5 is added to a byte address);
//           each channel block starts 16 bytes after the previous one.
//   oc    : channels still to be written; when fewer than 4 remain only that many lanes are stored.
// Returns immediately when oc or plane_size is zero.
// Scratch: x6 (pixels left in block), x7/x8 (store cursors), v0-v6, v16-v19.
C4BiasAddRelu6:
    // Nothing to do for an empty tensor; without this the loops below would still
    // run once for oc == 0 and wrap the pixel counter for plane_size == 0.
    cbz x3, End
    cbz x4, End

    dup v5.4s, wzr                  // v5 = {0, 0, 0, 0}, the lower bound
    movi v6.4s, #6                  // integer 6 in every lane ...
    scvtf v6.4s, v6.4s              // ... converted to 6.0f, the upper bound

LoopOc:
    ld1 {v4.4s}, [x2], #16          // bias for this block of 4 channels
    mov x6, x4                      // pixels left in this block
    mov x7, x0                      // store cursor for this block
    cmp x6, #4
    blt Loop1

Loop4:                              // four pixels per iteration
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fadd v0.4s, v0.4s, v4.4s
    fadd v1.4s, v1.4s, v4.4s
    fadd v2.4s, v2.4s, v4.4s
    fadd v3.4s, v3.4s, v4.4s

    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    fmin v0.4s, v0.4s, v6.4s
    fmin v1.4s, v1.4s, v6.4s
    fmin v2.4s, v2.4s, v6.4s
    fmin v3.4s, v3.4s, v6.4s

    // Pick the store width from the number of channels still outstanding.
    cmp x3, #4
    bge Write4x4
    cmp x3, #3
    beq Write3x4
    cmp x3, #2
    beq Write2x4

Write1x4:                           // one channel per pixel
    str s0, [x7]
    add x7, x7, x5
    str s1, [x7]
    add x7, x7, x5
    str s2, [x7]
    add x7, x7, x5
    str s3, [x7]
    add x7, x7, x5
    b WriteEndx4

Write2x4:                           // two channels per pixel
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    b WriteEndx4

Write3x4:                           // three channels per pixel: a pair via x7, lane 2 via x8
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x5
    st1 {v1.s}[2], [x8], x5
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x5
    st1 {v2.s}[2], [x8], x5
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x5
    st1 {v3.s}[2], [x8], x5
    b WriteEndx4

Write4x4:                           // full block of four channels per pixel
    st1 {v0.4s}, [x7], x5
    st1 {v1.4s}, [x7], x5
    st1 {v2.4s}, [x7], x5
    st1 {v3.4s}, [x7], x5

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    blt Loop1
    b Loop4

Loop1:                              // remaining pixels, one per iteration
    ld1 {v0.4s}, [x1], #16
    fadd v0.4s, v0.4s, v4.4s
    fmax v0.4s, v0.4s, v5.4s
    fmin v0.4s, v0.4s, v6.4s

    cmp x3, #4
    bge Write4
    cmp x3, #3
    beq Write3
    cmp x3, #2
    beq Write2

Write1:
    str s0, [x7]
    add x7, x7, x5
    b WriteEnd

Write2:
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    b WriteEnd

Write3:
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x5
    st1 {v0.s}[2], [x8], x5
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x5

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x3, x3, #4                 // sets the flags tested by bgt below
    add x0, x0, #16                 // next channel block in dst (add leaves the flags alone)
    bgt LoopOc

End:
    ret
#endif
| @@ -1,132 +0,0 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global C4Relu
#ifndef __APPLE__
    .type C4Relu, %function
#endif

// void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Clamps channel-blocked input at zero from below (ReLU) and scatters it to dst.
// There is no bias argument, so no bias register is ever loaded or added.
//   input : consumed sequentially, 4 floats per pixel, plane_size pixels for each block of 4 channels.
//   dst   : consecutive pixels are `stride` BYTES apart (x4 is added to a byte address);
//           each channel block starts 16 bytes after the previous one.
//   oc    : channels still to be written; when fewer than 4 remain only that many lanes are stored.
// Returns immediately when oc or plane_size is zero.
// Scratch: x6 (pixels left in block), x7/x8 (store cursors), v0-v3, v5, v16-v19.
C4Relu:
    // Nothing to do for an empty tensor; without this the loops below would still
    // run once for oc == 0 and wrap the pixel counter for plane_size == 0.
    cbz x2, End
    cbz x3, End

    dup v5.4s, wzr                  // v5 = {0, 0, 0, 0}, the ReLU lower bound

LoopOc:
    mov x6, x3                      // pixels left in this block
    mov x7, x0                      // store cursor for this block
    cmp x6, #4
    blt Loop1

Loop4:                              // four pixels per iteration
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    // Pick the store width from the number of channels still outstanding.
    cmp x2, #4
    bge Write4x4
    cmp x2, #3
    beq Write3x4
    cmp x2, #2
    beq Write2x4

Write1x4:                           // one channel per pixel
    str s0, [x7]
    add x7, x7, x4
    str s1, [x7]
    add x7, x7, x4
    str s2, [x7]
    add x7, x7, x4
    str s3, [x7]
    add x7, x7, x4
    b WriteEndx4

Write2x4:                           // two channels per pixel
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x4
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x4
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x4
    b WriteEndx4

Write3x4:                           // three channels per pixel: a pair via x7, lane 2 via x8
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    st1 {v0.s}[2], [x8], x4
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x4
    st1 {v1.s}[2], [x8], x4
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x4
    st1 {v2.s}[2], [x8], x4
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x4
    st1 {v3.s}[2], [x8], x4
    b WriteEndx4

Write4x4:                           // full block of four channels per pixel
    st1 {v0.4s}, [x7], x4
    st1 {v1.4s}, [x7], x4
    st1 {v2.4s}, [x7], x4
    st1 {v3.4s}, [x7], x4

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    blt Loop1
    b Loop4

Loop1:                              // remaining pixels, one per iteration
    ld1 {v0.4s}, [x1], #16
    // Same arithmetic as Loop4: clamp only. v4 is never loaded in this kernel,
    // so it must not be added here.
    fmax v0.4s, v0.4s, v5.4s

    cmp x2, #4
    bge Write4
    cmp x2, #3
    beq Write3
    cmp x2, #2
    beq Write2

Write1:
    str s0, [x7]
    add x7, x7, x4
    b WriteEnd

Write2:
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    b WriteEnd

Write3:
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    st1 {v0.s}[2], [x8], x4
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x4

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x2, x2, #4                 // sets the flags tested by bgt below
    add x0, x0, #16                 // next channel block in dst (add leaves the flags alone)
    bgt LoopOc

End:
    ret
#endif
| @@ -1,140 +0,0 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    //.p2align 5,,15
    .global C4Relu6
#ifndef __APPLE__
    .type C4Relu6, %function
#endif

// void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride)
// x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride
//
// Clamps channel-blocked input to the range [0, 6] (ReLU6) and scatters it to dst.
// There is no bias argument, so no bias register is ever loaded or added.
//   input : consumed sequentially, 4 floats per pixel, plane_size pixels for each block of 4 channels.
//   dst   : consecutive pixels are `stride` BYTES apart (x4 is added to a byte address);
//           each channel block starts 16 bytes after the previous one.
//   oc    : channels still to be written; when fewer than 4 remain only that many lanes are stored.
// Returns immediately when oc or plane_size is zero.
// Scratch: x6 (pixels left in block), x7/x8 (store cursors), v0-v3, v5, v6, v16-v19.
C4Relu6:
    // Nothing to do for an empty tensor; without this the loops below would still
    // run once for oc == 0 and wrap the pixel counter for plane_size == 0.
    cbz x2, End
    cbz x3, End

    dup v5.4s, wzr                  // v5 = {0, 0, 0, 0}, the lower bound
    movi v6.4s, #6                  // integer 6 in every lane ...
    scvtf v6.4s, v6.4s              // ... converted to 6.0f, the upper bound

LoopOc:
    mov x6, x3                      // pixels left in this block
    mov x7, x0                      // store cursor for this block
    cmp x6, #4
    blt Loop1

Loop4:                              // four pixels per iteration
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
    fmax v0.4s, v0.4s, v5.4s
    fmax v1.4s, v1.4s, v5.4s
    fmax v2.4s, v2.4s, v5.4s
    fmax v3.4s, v3.4s, v5.4s

    fmin v0.4s, v0.4s, v6.4s
    fmin v1.4s, v1.4s, v6.4s
    fmin v2.4s, v2.4s, v6.4s
    fmin v3.4s, v3.4s, v6.4s

    // Pick the store width from the number of channels still outstanding.
    cmp x2, #4
    bge Write4x4
    cmp x2, #3
    beq Write3x4
    cmp x2, #2
    beq Write2x4

Write1x4:                           // one channel per pixel
    str s0, [x7]
    add x7, x7, x4
    str s1, [x7]
    add x7, x7, x4
    str s2, [x7]
    add x7, x7, x4
    str s3, [x7]
    add x7, x7, x4
    b WriteEndx4

Write2x4:                           // two channels per pixel
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x4
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x4
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x4
    b WriteEndx4

Write3x4:                           // three channels per pixel: a pair via x7, lane 2 via x8
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    st1 {v0.s}[2], [x8], x4
    dup s17, v1.s[1]
    stp s1, s17, [x7]
    add x7, x7, x4
    st1 {v1.s}[2], [x8], x4
    dup s18, v2.s[1]
    stp s2, s18, [x7]
    add x7, x7, x4
    st1 {v2.s}[2], [x8], x4
    dup s19, v3.s[1]
    stp s3, s19, [x7]
    add x7, x7, x4
    st1 {v3.s}[2], [x8], x4
    b WriteEndx4

Write4x4:                           // full block of four channels per pixel
    st1 {v0.4s}, [x7], x4
    st1 {v1.4s}, [x7], x4
    st1 {v2.4s}, [x7], x4
    st1 {v3.4s}, [x7], x4

WriteEndx4:
    subs x6, x6, #4
    beq LoopOcEnd
    cmp x6, #4
    blt Loop1
    b Loop4

Loop1:                              // remaining pixels, one per iteration
    ld1 {v0.4s}, [x1], #16
    // Same arithmetic as Loop4: clamp only. v4 is never loaded in this kernel,
    // so it must not be added here.
    fmax v0.4s, v0.4s, v5.4s
    fmin v0.4s, v0.4s, v6.4s

    cmp x2, #4
    bge Write4
    cmp x2, #3
    beq Write3
    cmp x2, #2
    beq Write2

Write1:
    str s0, [x7]
    add x7, x7, x4
    b WriteEnd

Write2:
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    b WriteEnd

Write3:
    add x8, x7, #8
    dup s16, v0.s[1]
    stp s0, s16, [x7]
    add x7, x7, x4
    st1 {v0.s}[2], [x8], x4
    b WriteEnd

Write4:
    st1 {v0.4s}, [x7], x4

WriteEnd:
    subs x6, x6, #1
    bne Loop1

LoopOcEnd:
    subs x2, x2, #4                 // sets the flags tested by bgt below
    add x0, x0, #16                 // next channel block in dst (add leaves the flags alone)
    bgt LoopOc

End:
    ret
#endif
| @@ -40,32 +40,6 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p | |||
| return; | |||
| } | |||
| void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | |||
| #ifndef ENABLE_ARM64 | |||
| PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C4NUM); | |||
| #else | |||
| if (bias_ptr != NULL) { | |||
| if (is_relu) { | |||
| C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else if (is_relu6) { | |||
| C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else { | |||
| C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } | |||
| } else { | |||
| if (is_relu) { | |||
| C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else if (is_relu6) { | |||
| C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else { | |||
| // do nothing | |||
| } | |||
| } | |||
| #endif | |||
| return; | |||
| } | |||
| void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { | |||
| #ifndef ENABLE_ARM64 | |||
| @@ -27,8 +27,6 @@ | |||
| extern "C" { | |||
| #endif | |||
| void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6); | |||
| void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, | |||
| size_t plane_size, size_t stride, bool is_relu, bool is_relu6); | |||
| float ShortToFloat32(uint16_t src_value); | |||
| @@ -50,11 +48,6 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void Relu6(float *data, size_t element4); | |||
| void Relu(float *data, size_t element4); | |||
| void C4BiasAdd(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4BiasAddRelu(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4BiasAddRelu6(float *dst, const float *input, const float *bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, | |||
| size_t output_channel, size_t input_step); | |||
| @@ -45,8 +45,13 @@ int ConvolutionCPUKernel::InitWeightBias() { | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int oc_block, oc_block_num; | |||
| #ifdef ENABLE_ARM32 | |||
| oc_block = C4NUM; | |||
| oc_block_num = UP_DIV(out_channel, C4NUM); | |||
| #else | |||
| oc_block = C8NUM; | |||
| oc_block_num = UP_DIV(out_channel, C8NUM); | |||
| #endif | |||
| int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; | |||
| auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData()); | |||
| @@ -113,11 +118,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() { | |||
| auto output_tensor = out_tensors_.at(kOutputIndex); | |||
| output_tensor->SetFormat(schema::Format::Format_NHWC); | |||
| // #ifdef ENABLE_ARM32 | |||
| // gemm_func_ = IndirectGemmFp32_8x4; | |||
| // #else | |||
| #ifdef ENABLE_ARM32 | |||
| gemm_func_ = IndirectGemmFp32_8x4; | |||
| #else | |||
| gemm_func_ = IndirectGemmFp32_8x8; | |||
| // #endif | |||
| #endif | |||
| } | |||
| int ConvolutionCPUKernel::Init() { | |||
| @@ -170,79 +170,6 @@ TEST_F(TestConv1x1Fp32, Conv1x1WeightTest1) { | |||
| delete conv_param; | |||
| } | |||
| TEST_F(TestConv1x1Fp32, PostConvFuncC4Test1) { | |||
| float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, | |||
| -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, | |||
| -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, | |||
| 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, | |||
| -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, | |||
| 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, | |||
| -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, | |||
| -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; | |||
| float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; | |||
| float out[40] = {0}; | |||
| float no[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, | |||
| 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, | |||
| -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, | |||
| -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, | |||
| -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; | |||
| PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, false); | |||
| CompareOutputData(out, no, 40, 0.0001); | |||
| float relu[] = {0, 0, 8.56133, 0, 0, 0, 1.2270198, 17.954533, 11.086085, 0, | |||
| 0, 0, 11.90631, 0.3088621, 11.196218, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 9.464027, 0, 14.387108, 8.693133, 8.080041, 0, 0, | |||
| 2.8319538, 7.177942, 0, 12.194644, 0, 0, 0, 0, 0, 0}; | |||
| PostConvFuncFp32C4(in, out, bias, 5, 8, 5, true, false); | |||
| CompareOutputData(out, relu, 40, 0.0001); | |||
| float corr_relu6[] = {0, 0, 6, 0, 0, 0, 1.2270198, 6, 6, 0, 0, 0, 6, 0.3088621, 6, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 6, 0, 6, 6, 6, 0, 0, 2.8319538, 6, 0, 6, 0, 0, 0, 0, 0, 0}; | |||
| PostConvFuncFp32C4(in, out, bias, 5, 8, 5, false, true); | |||
| CompareOutputData(out, corr_relu6, 40, 0.0001); | |||
| float nob_relu[] = {0, 0, 7.5724425, 0, 0, 0, 0.7406984, 16.965645, | |||
| 10.888806, 0, 0, 0, 10.917422, 0.11158327, 11.1863365, 0, | |||
| 0, 0, 0, 0, 0, 0, 0, 9.266748, | |||
| 0, 13.644127, 8.206812, 7.091153, 0, 0, 2.0889723, 6.6916203, | |||
| 0, 11.997365, 0, 0, 0, 0, 0, 0}; | |||
| PostConvFuncFp32C4(in, out, nullptr, 5, 8, 5, true, false); | |||
| CompareOutputData(out, nob_relu, 40, 0.0001); | |||
| } | |||
| TEST_F(TestConv1x1Fp32, PostConvFuncC4Test2) { | |||
| float in[] = {-9.389655, -5.83877, 7.5724425, -1.4675674, -5.456284, 0.7406984, 16.965645, 10.888806, | |||
| -0.8614793, -4.404605, 10.917422, 0.11158327, -5.2733865, -0.96367484, -4.731118, -7.576815, | |||
| -6.1621623, -0.6315082, -9.140878, 9.266748, 13.644127, 8.206812, 7.091153, -0.50162584, | |||
| 2.0889723, 6.6916203, -5.3981733, 11.997365, -9.254076, -5.5964484, -5.981469, -0.51114964, | |||
| -2.6300175, 0, 0, 0, -7.2690716, 0, 0, 0, | |||
| 11.1863365, 0, 0, 0, -3.4595785, 0, 0, 0, | |||
| -8.344107, 0, 0, 0, -3.792715, 0, 0, 0, | |||
| -7.0394287, 0, 0, 0, -2.7693212, 0, 0, 0}; | |||
| float bias[] = {0.7429814, 0.4863214, 0.9888875, 0.19727881, 0.009881007, 0, 0, 0}; | |||
| float corr[] = {-8.646674, -5.3524485, 8.56133, -1.2702886, -2.6201365, -4.7133026, 1.2270198, 17.954533, | |||
| 11.086085, -7.2591906, -0.11849791, -3.9182835, 11.90631, 0.3088621, 11.196218, -4.530405, | |||
| -0.47735345, -3.7422307, -7.379536, -3.4496975, -5.419181, -0.14518678, -8.15199, 9.464027, | |||
| -8.334226, 14.387108, 8.693133, 8.080041, -0.30434704, -3.782834, 2.8319538, 7.177942, | |||
| -4.409286, 12.194644, -7.0295477, -8.511095, -5.110127, -4.992582, -0.31387085, -2.7594402}; | |||
| float out[40] = {0}; | |||
| int thread_count_ = 2; | |||
| int thread_oc4_stride_ = 1; | |||
| int output_channel = 5; | |||
| int plane_size = 8; | |||
| for (int i = 0; i < thread_count_; i++) { | |||
| int cur_oc = MSMIN(thread_oc4_stride_ * 4, output_channel - i * thread_oc4_stride_ * 4); | |||
| if (cur_oc <= 0) break; | |||
| PostConvFuncFp32C4(in + thread_oc4_stride_ * i * 8 * 4, out + i * i * thread_oc4_stride_ * 4, | |||
| bias + i * thread_oc4_stride_ * 4, cur_oc, plane_size, output_channel, false, false); | |||
| } | |||
| CompareOutputData(out, corr, 40, 0.0001); | |||
| } | |||
| int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_, | |||
| ConvParameter *conv_param, float **correct) { | |||
| lite::Tensor *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, | |||