| @@ -1,4 +1,5 @@ | |||
| #ifdef __aarch64__ | |||
| #ifdef __arm__ | |||
| #ifndef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| @@ -10,15 +11,16 @@ | |||
| // void IndirectGemmFp32_8x4(float *output, float *input, float *weight, float *bias, | |||
| // size_t kSize, size_t ic4, size_t oc8, size_t offset, size_t mode, size_t writeC4, size_t relu, size_t relu6); | |||
| // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset | |||
| // r8:mode, r10: writeMode, x10: relu, r10:relu6 | |||
| // r8:mode, r10: writeMode, r10: relu, r10:relu6 | |||
| // mode = 0 for general convolution, where one conv unit is a row | |||
| // mode = 1 for winograd/common gemm, where the total channels of one input is a row | |||
| IndirectGemmFp32_8x4: | |||
| .macro INIT_BIAS | |||
| veor q10, q10, q10 | |||
| cbz x3, InitBias | |||
| vld1.32 q10, [x3] | |||
| cmp r3, #0 | |||
| beq InitBias | |||
| vld1.32 q10, [r3] | |||
| InitBias: | |||
| vmov q11, q10 | |||
| vmov q12, q10 | |||
| @@ -27,10 +29,11 @@ IndirectGemmFp32_8x4: | |||
| vmov q15, q10 | |||
| .endm | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // r19 ~ r29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" | |||
| // according to https://stackoverflow.com/questions/53625807 | |||
| // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway | |||
| // clang's rule seems more simple, though there are no subroutine calls here | |||
| // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf | |||
| push {r4-r8, r10, r11, lr} | |||
| vpush {q4-q7} | |||
| add sp, sp, #160 | |||
| @@ -41,7 +44,8 @@ IndirectGemmFp32_8x4: | |||
| ldr r7, [sp, #12] | |||
| ldr r8, [sp, #16] | |||
| cbnz r8, LoopOc | |||
| cmp r8, #0 | |||
| bne LoopOc | |||
| // step is one for common convolution, where ic8 should multiply by kernel size | |||
| // step is (a+b-1) for F(a,b) in winograd | |||
| mul r5, r4, r5 | |||
| @@ -57,17 +61,18 @@ IndirectGemmFp32_8x4: | |||
| INIT_BIAS | |||
| // load input for output 1-2 | |||
| vld1.32 {q0, q1, q2, q3}, [x12]! | |||
| vld1.32 {q0, q1}, [r12]! | |||
| vld1.32 {q2, q3}, [r12]! | |||
| // load weight | |||
| vld1.32 {q4, q5}, [x2]! | |||
| vld1.32 {q4, q5}, [r2]! | |||
| // step for output 1-2 | |||
| vmul.f32 q8, q4, d0[0] | |||
| vmul.f32 q9, q4, d2[0] | |||
| vmla.f32 q8, q5, d0[1] | |||
| vmla.f32 q9, q5, d2[1] | |||
| vld1.32 {q6, q7}, [x2]! | |||
| vld1.32 {q6, q7}, [r2]! | |||
| subs x10, x5, #1 | |||
| subs r10, r5, #1 | |||
| beq LoopIcEnd | |||
| LoopIc: | |||
| @@ -146,9 +151,11 @@ IndirectGemmFp32_8x4: | |||
| vmla.f32 q15, q7, d7[1] | |||
| ldr r10, [sp, #28] | |||
| cbnz r10, Relu6 | |||
| cmp r10, #0 | |||
| bne Relu6 | |||
| ldr r10, [sp, #24] | |||
| cbnz x10, Relu | |||
| cmp r10, #0 | |||
| bne Relu | |||
| b WriteStart | |||
| Relu6: | |||
| vmov.i32 q14, #6 | |||
| @@ -174,7 +181,8 @@ IndirectGemmFp32_8x4: | |||
| WriteStart: | |||
| ldr r10, [sp, #20] | |||
| cbnz x10, WriteC4 | |||
| cmp r10, #0 | |||
| bne WriteC4 | |||
| cmp r6, #1 | |||
| beq Write1 | |||
| cmp r6, #2 | |||
| @@ -183,97 +191,97 @@ IndirectGemmFp32_8x4: | |||
| beq Write3 | |||
| b Write4 | |||
| Write1: | |||
| str s0, [r11] | |||
| add r11, r11, x7 | |||
| str s4, [r11] | |||
| add r11, r11, x7 | |||
| str s8, [r11] | |||
| add r11, r11, x7 | |||
| str s12, [r11] | |||
| add r11, r11, x7 | |||
| str s16, [r11] | |||
| add r11, r11, x7 | |||
| str s20, [r11] | |||
| add r11, r11, x7 | |||
| str s24, [r11] | |||
| add r11, r11, x7 | |||
| str s28, [r11] | |||
| vst1.32 d0[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d2[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d4[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d6[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d8[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d10[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d12[0], [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d14[0], [r11] | |||
| add r0, r0, #4 | |||
| b WriteEnd | |||
| Write2: | |||
| str d0, [r11] | |||
| add r11, r11, x7 | |||
| str d2, [r11] | |||
| add r11, r11, x7 | |||
| str d4, [r11] | |||
| add r11, r11, x7 | |||
| str d6, [r11] | |||
| add r11, r11, x7 | |||
| str d8, [r11] | |||
| add r11, r11, x7 | |||
| str d10, [r11] | |||
| add r11, r11, x7 | |||
| str d12, [r11] | |||
| add r11, r11, x7 | |||
| str d14, [r11] | |||
| vst1.32 d0, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d2, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d4, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d6, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d8, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d10, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d12, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d14, [r11] | |||
| add r0, r0, #8 | |||
| b WriteEnd | |||
| Write3: | |||
| add r12, r11, #8 | |||
| str d0, [r11] | |||
| add r11, r11, x7 | |||
| str s2, [r12] | |||
| vst1.32 d0, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d1[0], [r12] | |||
| add r12, r12, r7 | |||
| str d2, [r11] | |||
| add r11, r11, x7 | |||
| str s6, [r12] | |||
| vst1.32 d2, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d3[0], [r12] | |||
| add r12, r12, r7 | |||
| str d4, [r11] | |||
| add r11, r11, x7 | |||
| str s10, [r12] | |||
| vst1.32 d4, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d5[0], [r12] | |||
| add r12, r12, r7 | |||
| str d6, [r11] | |||
| add r11, r11, x7 | |||
| str s14, [r12] | |||
| vst1.32 d6, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d7[0], [r12] | |||
| add r12, r12, r7 | |||
| str d8, [r11] | |||
| add r11, r11, x7 | |||
| str s18, [r12] | |||
| vst1.32 d8, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d9[0], [r12] | |||
| add r12, r12, r7 | |||
| str d10, [r11] | |||
| add r11, r11, x7 | |||
| str s22, [r12] | |||
| vst1.32 d10, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d11[0], [r12] | |||
| add r12, r12, r7 | |||
| str d12, [r11] | |||
| add r11, r11, x7 | |||
| str s26, [r12] | |||
| vst1.32 d12, [r11] | |||
| add r11, r11, r7 | |||
| vst1.32 d13[0], [r12] | |||
| add r12, r12, r7 | |||
| str d14, [r11] | |||
| str s30, [r12] | |||
| vst1.32 d14, [r11] | |||
| vst1.32 d15[0], [r12] | |||
| add r0, r0, #12 | |||
| b WriteEnd | |||
| WriteC4: | |||
| vst1.32 q0, [r11], x7 | |||
| vst1.32 q1, [r11], x7 | |||
| vst1.32 q2, [r11], x7 | |||
| vst1.32 q3, [r11], x7 | |||
| vst1.32 q4, [r11], x7 | |||
| vst1.32 q5, [r11], x7 | |||
| vst1.32 q6, [r11], x7 | |||
| vst1.32 q0, [r11], r7 | |||
| vst1.32 q1, [r11], r7 | |||
| vst1.32 q2, [r11], r7 | |||
| vst1.32 q3, [r11], r7 | |||
| vst1.32 q4, [r11], r7 | |||
| vst1.32 q5, [r11], r7 | |||
| vst1.32 q6, [r11], r7 | |||
| vst1.32 q7, [r11] | |||
| add r0, r0, #16 | |||
| b WriteEnd | |||
| Write4: | |||
| // prefetching is not prefered while writing results in spite of cache missings | |||
| // you could try prfm pstl2strm | |||
| // you could try prfm pstl2vst1.32m | |||
| // there are almost no benefits observed though | |||
| vst1.32 q0, [r11], x7 | |||
| vst1.32 q1, [r11], x7 | |||
| vst1.32 q2, [r11], x7 | |||
| vst1.32 q3, [r11], x7 | |||
| vst1.32 q4, [r11], x7 | |||
| vst1.32 q5, [r11], x7 | |||
| vst1.32 q6, [r11], x7 | |||
| vst1.32 q0, [r11], r7 | |||
| vst1.32 q1, [r11], r7 | |||
| vst1.32 q2, [r11], r7 | |||
| vst1.32 q3, [r11], r7 | |||
| vst1.32 q4, [r11], r7 | |||
| vst1.32 q5, [r11], r7 | |||
| vst1.32 q6, [r11], r7 | |||
| vst1.32 q7, [r11] | |||
| add r0, r0, #16 | |||
| @@ -283,7 +291,8 @@ IndirectGemmFp32_8x4: | |||
| bne LoopKsize | |||
| subs r6, r6, #4 | |||
| cbz r3, NoStepFowrard | |||
| cmp r3, #0 | |||
| beq NoStepFowrard | |||
| add r3, r3, #16 | |||
| NoStepFowrard: | |||
| bgt LoopOc | |||
| @@ -292,3 +301,4 @@ IndirectGemmFp32_8x4: | |||
| vpop {q4-q7} | |||
| pop {r4-r8, r10, r11, pc} | |||
| #endif | |||
| #endif | |||
| @@ -1,4 +1,5 @@ | |||
| #ifdef __aarch64__ | |||
| #ifdef __arm__ | |||
| #ifndef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| @@ -10,8 +11,8 @@ | |||
| // void IndirectGemmInt8_2x4(int8_t *output, int8_t *input, int8_t *weight, int32_t *bias, size_t ksize, size_t ic4, | |||
| // size_t oc, size_t offset, int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, size_t out_multiplier, | |||
| // size_t shift_before, size_t shift_after); | |||
| // x0: output, x1: input, r2: weight, x3: bias, x4: kSize, x5: ic4, x6: oc, x7: offset | |||
| // x8: input_sum, x10: act_min, x11: act_max, x10: out_zp, x11: out_multiplier, x10: shift_before, x11: shift_after | |||
| // r0: output, r1: input, r2: weight, r3: bias, r4: kSize, r5: ic4, r6: oc, r7: offset | |||
| // r8: input_sum, r10: act_min, r11: act_max, r10: out_zp, r11: out_multiplier, r10: shift_before, r11: shift_after | |||
| IndirectGemmInt8_2x4: | |||
| .macro INIT_BIAS | |||
| @@ -73,7 +74,7 @@ IndirectGemmInt8_2x4: | |||
| vmlal.s8 q6, d1, d9 | |||
| vmlal.s8 q7, d1, d11 | |||
| subs x10, x5, #1 | |||
| subs r10, r5, #1 | |||
| beq LoopIcEnd | |||
| LoopIc: | |||
| @@ -107,7 +108,7 @@ IndirectGemmInt8_2x4: | |||
| vmlal.s8 q6, d1, d9 | |||
| vmlal.s8 q7, d1, d11 | |||
| subs x10, x10, #1 | |||
| subs r10, r10, #1 | |||
| bne LoopIc | |||
| LoopIcEnd: | |||
| @@ -131,15 +132,23 @@ IndirectGemmInt8_2x4: | |||
| vld1.32 q0[], [r10]! | |||
| vld1.32 q1[], [r10]! | |||
| // pairwise add | |||
| vpadd.i32 q8, q8, q9 | |||
| vpadd.i32 q10, q10, q11 | |||
| vpadd.i32 q12, q12, q13 | |||
| vpadd.i32 q14, q14, q15 | |||
| vpadd.i32 q8, q8, q10 | |||
| vpadd.i32 q12, q12, q14 | |||
| vpadd.i32 d16, d16, d17 | |||
| vpadd.i32 d18, d18, d19 | |||
| vpadd.i32 d20, d20, d21 | |||
| vpadd.i32 d22, d22, d23 | |||
| vpadd.i32 d24, d24, d25 | |||
| vpadd.i32 d26, d26, d27 | |||
| vpadd.i32 d28, d28, d29 | |||
| vpadd.i32 d30, d30, d31 | |||
| vpadd.i32 d16, d16, d18 | |||
| vpadd.i32 d17, d20, d22 | |||
| vpadd.i32 d24, d24, d26 | |||
| vpadd.i32 d25, d28, d30 | |||
| vsub.i32 q8, q8, q0 | |||
| vsub.i32 q12, q12, q1 | |||
| cbz r3, NoBias | |||
| cmp r3, #0 | |||
| beq NoBias | |||
| vld1.32 q2, [r3] | |||
| vadd.i32 q8, q8, q2 | |||
| vadd.i32 q12, q12, q2 | |||
| @@ -182,34 +191,34 @@ IndirectGemmInt8_2x4: | |||
| // prefetching is not prefered while writing results in spite of cache missings | |||
| // you could try prfm pstl2strm | |||
| WriteStart: | |||
| cmp x6, #1 | |||
| cmp r6, #1 | |||
| beq Write1 | |||
| cmp x6, #2 | |||
| cmp r6, #2 | |||
| beq Write2 | |||
| cmp x6, #3 | |||
| cmp r6, #3 | |||
| beq Write3 | |||
| b Write4 | |||
| Write1: | |||
| vst1.8 {d0[0]}, [x11], x7 | |||
| vst1.8 {d0[1]}, [x11] | |||
| vst1.8 {d0[0]}, [r11], r7 | |||
| vst1.8 {d0[1]}, [r11] | |||
| add r0, r0, #1 | |||
| b WriteEnd | |||
| Write2: | |||
| vst1.16 {d0[0]}, [x11], x7 | |||
| vst1.16 {d0[1]}, [x11] | |||
| vst1.16 {d0[0]}, [r11], r7 | |||
| vst1.16 {d0[1]}, [r11] | |||
| add r0, r0, #2 | |||
| b WriteEnd | |||
| Write3: | |||
| add x14, x11, #2 | |||
| vst1.16 {d0[0]}, [x11], x7 | |||
| vst1.16 {d0[1]}, [x11] | |||
| vst1.8 {d0[0]}, [x14], x7 | |||
| vst1.8 {d0[1]}, [x14] | |||
| add r14, r11, #2 | |||
| vst1.16 {d0[0]}, [r11], r7 | |||
| vst1.16 {d0[1]}, [r11] | |||
| vst1.8 {d0[0]}, [r14], r7 | |||
| vst1.8 {d0[1]}, [r14] | |||
| add r0, r0, #3 | |||
| b WriteEnd | |||
| Write4: | |||
| vst1.32 {d0[0]}, [x11], x7 | |||
| vst1.32 {d0[1]}, [x11] | |||
| vst1.32 {d0[0]}, [r11], r7 | |||
| vst1.32 {d0[1]}, [r11] | |||
| add r0, r0, #4 | |||
| WriteEnd: | |||
| @@ -218,7 +227,8 @@ IndirectGemmInt8_2x4: | |||
| bne LoopKsize | |||
| subs r6, r6, #4 | |||
| cbz r3, NoStepFowrard | |||
| cmp r3, #0 | |||
| beq NoStepFowrard | |||
| add r3, r3, #16 | |||
| NoStepFowrard: | |||
| bgt LoopOc | |||
| @@ -227,3 +237,4 @@ IndirectGemmInt8_2x4: | |||
| vpop {q4-q7} | |||
| pop {r4-r8, r10, r11, pc} | |||
| #endif | |||
| #endif | |||
| @@ -0,0 +1,131 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global C4BiasAdd | |||
| #ifndef __APPLE__ | |||
| .type C4BiasAdd, %function | |||
| #endif | |||
| //void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||
| C4BiasAdd: | |||
| LoopOc: | |||
| ld1 {v4.4s}, [x2], #16 | |||
| mov x6, x4 | |||
| mov x7, x0 | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| Loop4: | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fadd v1.4s, v1.4s, v4.4s | |||
| fadd v2.4s, v2.4s, v4.4s | |||
| fadd v3.4s, v3.4s, v4.4s | |||
| cmp x3, #4 | |||
| bge Write4x4 | |||
| cmp x3, #3 | |||
| beq Write3x4 | |||
| cmp x3, #2 | |||
| beq Write2x4 | |||
| Write1x4: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| str s1, [x7] | |||
| add x7, x7, x5 | |||
| str s2, [x7] | |||
| add x7, x7, x5 | |||
| str s3, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write2x4: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write3x4: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v1.s}[2], [x8], x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v2.s}[2], [x8], x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v3.s}[2], [x8], x5 | |||
| b WriteEndx4 | |||
| Write4x4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| st1 {v1.4s}, [x7], x5 | |||
| st1 {v2.4s}, [x7], x5 | |||
| st1 {v3.4s}, [x7], x5 | |||
| WriteEndx4: | |||
| subs x6, x6, #4 | |||
| beq LoopOcEnd | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| b Loop4 | |||
| Loop1: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| cmp x3, #4 | |||
| bge Write4 | |||
| cmp x3, #3 | |||
| beq Write3 | |||
| cmp x3, #2 | |||
| beq Write2 | |||
| Write1: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write3: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| WriteEnd: | |||
| subs x6, x6, #1 | |||
| bne Loop1 | |||
| LoopOcEnd: | |||
| subs x3, x3, #4 | |||
| add x0, x0, #16 | |||
| bgt LoopOc | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,137 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global C4BiasAddRelu | |||
| #ifndef __APPLE__ | |||
| .type C4BiasAddRelu, %function | |||
| #endif | |||
| //void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||
| C4BiasAddRelu: | |||
| dup v5.4s, wzr | |||
| LoopOc: | |||
| ld1 {v4.4s}, [x2], #16 | |||
| mov x6, x4 | |||
| mov x7, x0 | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| Loop4: | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fadd v1.4s, v1.4s, v4.4s | |||
| fadd v2.4s, v2.4s, v4.4s | |||
| fadd v3.4s, v3.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmax v1.4s, v1.4s, v5.4s | |||
| fmax v2.4s, v2.4s, v5.4s | |||
| fmax v3.4s, v3.4s, v5.4s | |||
| cmp x3, #4 | |||
| bge Write4x4 | |||
| cmp x3, #3 | |||
| beq Write3x4 | |||
| cmp x3, #2 | |||
| beq Write2x4 | |||
| Write1x4: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| str s1, [x7] | |||
| add x7, x7, x5 | |||
| str s2, [x7] | |||
| add x7, x7, x5 | |||
| str s3, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write2x4: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write3x4: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v1.s}[2], [x8], x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v2.s}[2], [x8], x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v3.s}[2], [x8], x5 | |||
| b WriteEndx4 | |||
| Write4x4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| st1 {v1.4s}, [x7], x5 | |||
| st1 {v2.4s}, [x7], x5 | |||
| st1 {v3.4s}, [x7], x5 | |||
| WriteEndx4: | |||
| subs x6, x6, #4 | |||
| beq LoopOcEnd | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| b Loop4 | |||
| Loop1: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| cmp x3, #4 | |||
| bge Write4 | |||
| cmp x3, #3 | |||
| beq Write3 | |||
| cmp x3, #2 | |||
| beq Write2 | |||
| Write1: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write3: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| WriteEnd: | |||
| subs x6, x6, #1 | |||
| bne Loop1 | |||
| LoopOcEnd: | |||
| subs x3, x3, #4 | |||
| add x0, x0, #16 | |||
| bgt LoopOc | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,146 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global C4BiasAddRelu6 | |||
| #ifndef __APPLE__ | |||
| .type C4BiasAddRelu6, %function | |||
| #endif | |||
| //void C4BiC4BiasAddRelu6asAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||
| //x0: dst, x1: input, x2: bias, x3: oc, x4: plane_size, x5: stride | |||
| C4BiasAddRelu6: | |||
| dup v5.4s, wzr | |||
| movi v6.4s, #6 | |||
| scvtf v6.4s, v6.4s | |||
| LoopOc: | |||
| ld1 {v4.4s}, [x2], #16 | |||
| mov x6, x4 | |||
| mov x7, x0 | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| Loop4: | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fadd v1.4s, v1.4s, v4.4s | |||
| fadd v2.4s, v2.4s, v4.4s | |||
| fadd v3.4s, v3.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmax v1.4s, v1.4s, v5.4s | |||
| fmax v2.4s, v2.4s, v5.4s | |||
| fmax v3.4s, v3.4s, v5.4s | |||
| fmin v0.4s, v0.4s, v6.4s | |||
| fmin v1.4s, v1.4s, v6.4s | |||
| fmin v2.4s, v2.4s, v6.4s | |||
| fmin v3.4s, v3.4s, v6.4s | |||
| cmp x3, #4 | |||
| bge Write4x4 | |||
| cmp x3, #3 | |||
| beq Write3x4 | |||
| cmp x3, #2 | |||
| beq Write2x4 | |||
| Write1x4: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| str s1, [x7] | |||
| add x7, x7, x5 | |||
| str s2, [x7] | |||
| add x7, x7, x5 | |||
| str s3, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write2x4: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEndx4 | |||
| Write3x4: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v1.s}[2], [x8], x5 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v2.s}[2], [x8], x5 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v3.s}[2], [x8], x5 | |||
| b WriteEndx4 | |||
| Write4x4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| st1 {v1.4s}, [x7], x5 | |||
| st1 {v2.4s}, [x7], x5 | |||
| st1 {v3.4s}, [x7], x5 | |||
| WriteEndx4: | |||
| subs x6, x6, #4 | |||
| beq LoopOcEnd | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| b Loop4 | |||
| Loop1: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmin v0.4s, v0.4s, v6.4s | |||
| cmp x3, #4 | |||
| bge Write4 | |||
| cmp x3, #3 | |||
| beq Write3 | |||
| cmp x3, #2 | |||
| beq Write2 | |||
| Write1: | |||
| str s0, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| b WriteEnd | |||
| Write3: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x5 | |||
| st1 {v0.s}[2], [x8], x5 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v0.4s}, [x7], x5 | |||
| WriteEnd: | |||
| subs x6, x6, #1 | |||
| bne Loop1 | |||
| LoopOcEnd: | |||
| subs x3, x3, #4 | |||
| add x0, x0, #16 | |||
| bgt LoopOc | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,132 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global C4Relu | |||
| #ifndef __APPLE__ | |||
| .type C4Relu, %function | |||
| #endif | |||
| //void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride) | |||
| //x0: dst, x1: input, x2: oc, x3: plane_size, x4: stride | |||
| C4Relu: | |||
| dup v5.4s, wzr | |||
| LoopOc: | |||
| mov x6, x3 | |||
| mov x7, x0 | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| Loop4: | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmax v1.4s, v1.4s, v5.4s | |||
| fmax v2.4s, v2.4s, v5.4s | |||
| fmax v3.4s, v3.4s, v5.4s | |||
| cmp x2, #4 | |||
| bge Write4x4 | |||
| cmp x2, #3 | |||
| beq Write3x4 | |||
| cmp x2, #2 | |||
| beq Write2x4 | |||
| Write1x4: | |||
| str s0, [x7] | |||
| add x7, x7, x4 | |||
| str s1, [x7] | |||
| add x7, x7, x4 | |||
| str s2, [x7] | |||
| add x7, x7, x4 | |||
| str s3, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEndx4 | |||
| Write2x4: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x4 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x4 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEndx4 | |||
| Write3x4: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v0.s}[2], [x8], x4 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v1.s}[2], [x8], x4 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v2.s}[2], [x8], x4 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v3.s}[2], [x8], x4 | |||
| b WriteEndx4 | |||
| Write4x4: | |||
| st1 {v0.4s}, [x7], x4 | |||
| st1 {v1.4s}, [x7], x4 | |||
| st1 {v2.4s}, [x7], x4 | |||
| st1 {v3.4s}, [x7], x4 | |||
| WriteEndx4: | |||
| subs x6, x6, #4 | |||
| beq LoopOcEnd | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| b Loop4 | |||
| Loop1: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| cmp x2, #4 | |||
| bge Write4 | |||
| cmp x2, #3 | |||
| beq Write3 | |||
| cmp x2, #2 | |||
| beq Write2 | |||
| Write1: | |||
| str s0, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEnd | |||
| Write3: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v0.s}[2], [x8], x4 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v0.4s}, [x7], x4 | |||
| WriteEnd: | |||
| subs x6, x6, #1 | |||
| bne Loop1 | |||
| LoopOcEnd: | |||
| subs x2, x2, #4 | |||
| add x0, x0, #16 | |||
| bgt LoopOc | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,140 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global C4Relu6 | |||
| #ifndef __APPLE__ | |||
| .type C4Relu6, %function | |||
| #endif | |||
| //void C4Relu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride) | |||
| //x0: dst, x1: input, x2: oc, x2: plane_size, x3: stride | |||
| C4Relu6: | |||
| dup v5.4s, wzr | |||
| movi v6.4s, #6 | |||
| scvtf v6.4s, v6.4s | |||
| LoopOc: | |||
| mov x6, x3 | |||
| mov x7, x0 | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| Loop4: | |||
| ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmax v1.4s, v1.4s, v5.4s | |||
| fmax v2.4s, v2.4s, v5.4s | |||
| fmax v3.4s, v3.4s, v5.4s | |||
| fmin v0.4s, v0.4s, v6.4s | |||
| fmin v1.4s, v1.4s, v6.4s | |||
| fmin v2.4s, v2.4s, v6.4s | |||
| fmin v3.4s, v3.4s, v6.4s | |||
| cmp x2, #4 | |||
| bge Write4x4 | |||
| cmp x2, #3 | |||
| beq Write3x4 | |||
| cmp x2, #2 | |||
| beq Write2x4 | |||
| Write1x4: | |||
| str s0, [x7] | |||
| add x7, x7, x4 | |||
| str s1, [x7] | |||
| add x7, x7, x4 | |||
| str s2, [x7] | |||
| add x7, x7, x4 | |||
| str s3, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEndx4 | |||
| Write2x4: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x4 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x4 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEndx4 | |||
| Write3x4: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v0.s}[2], [x8], x4 | |||
| dup s17, v1.s[1] | |||
| stp s1, s17, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v1.s}[2], [x8], x4 | |||
| dup s18, v2.s[1] | |||
| stp s2, s18, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v2.s}[2], [x8], x4 | |||
| dup s19, v3.s[1] | |||
| stp s3, s19, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v3.s}[2], [x8], x4 | |||
| b WriteEndx4 | |||
| Write4x4: | |||
| st1 {v0.4s}, [x7], x4 | |||
| st1 {v1.4s}, [x7], x4 | |||
| st1 {v2.4s}, [x7], x4 | |||
| st1 {v3.4s}, [x7], x4 | |||
| WriteEndx4: | |||
| subs x6, x6, #4 | |||
| beq LoopOcEnd | |||
| cmp x6, #4 | |||
| blt Loop1 | |||
| b Loop4 | |||
| Loop1: | |||
| ld1 {v0.4s}, [x1], #16 | |||
| fadd v0.4s, v0.4s, v4.4s | |||
| fmax v0.4s, v0.4s, v5.4s | |||
| fmin v0.4s, v0.4s, v6.4s | |||
| cmp x2, #4 | |||
| bge Write4 | |||
| cmp x2, #3 | |||
| beq Write3 | |||
| cmp x2, #2 | |||
| beq Write2 | |||
| Write1: | |||
| str s0, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEnd | |||
| Write2: | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| b WriteEnd | |||
| Write3: | |||
| add x8, x7, #8 | |||
| dup s16, v0.s[1] | |||
| stp s0, s16, [x7] | |||
| add x7, x7, x4 | |||
| st1 {v0.s}[2], [x8], x4 | |||
| b WriteEnd | |||
| Write4: | |||
| st1 {v0.4s}, [x7], x4 | |||
| WriteEnd: | |||
| subs x6, x6, #1 | |||
| bne Loop1 | |||
| LoopOcEnd: | |||
| subs x2, x2, #4 | |||
| add x0, x0, #16 | |||
| bgt LoopOc | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,96 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global ConvDwFp32Center | |||
| #ifndef __APPLE__ | |||
| .type ConvDwFp32Center, %function | |||
| #endif | |||
| // void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| // size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, | |||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | |||
| // x14: relu, x15: relu6 | |||
| ConvDwFp32Center: | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| sub sp, sp, #48 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ldr x11, [sp, #24] | |||
| ldr x12, [sp, #32] | |||
| ldr x13, [sp, #40] | |||
| ldr x14, [sp, #48] | |||
| ldr x15, [sp, #56] | |||
| mov x16, #4 | |||
| mul x8, x8, x16 | |||
| mul x9, x9, x16 | |||
| mul x10, x10, x16 | |||
| mul x11, x11, x16 | |||
| mul x12, x12, x16 | |||
| mul x13, x13, x16 | |||
| mov x16, #16 | |||
| mul x19, x7, x16 | |||
| ld1 {v5.4s}, [x3] | |||
| LoopH: | |||
| mov x23, x1 | |||
| mov x24, x5 | |||
| mov x3, x0 | |||
| LoopW: | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| ld1 {v0.4s}, [x3] | |||
| fadd v0.4s, v0.4s, v5.4s | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x21, x17 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v1.4s}, [x22], x13 | |||
| ld1 {v2.4s}, [x21], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| subs x18, x18, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| add x17, x17, x19 | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| cbnz x15, Relu6 | |||
| cbnz x14, Relu | |||
| b Write | |||
| Relu6: | |||
| movi v4.4s, #6 | |||
| scvtf v4.4s, v4.4s | |||
| fmin v0.4s, v0.4s, v4.4s | |||
| Relu: | |||
| dup v3.4s, wzr | |||
| fmax v0.4s, v0.4s, v3.4s | |||
| Write: | |||
| st1 {v0.4s}, [x3], x9 | |||
| add x23, x23, x11 | |||
| subs x24, x24, #1 | |||
| bne LoopW | |||
| add x0, x0, x8 | |||
| add x1, x1, x10 | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #48 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -0,0 +1,77 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global DeconvDwFp32Center | |||
| #ifndef __APPLE__ | |||
| .type DeconvDwFp32Center, %function | |||
| #endif | |||
| // Depthwise transposed-convolution (deconv) center kernel: for each output position it | |||
| // scatters one C4 (4 x float32) src pixel through the kernel_h x kernel_w weights, | |||
| // accumulating into dst (dst += src * weight), over a height x width area. | |||
| // void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| // size_t in_kh_step, size_t in_kw_step); | |||
| // x0: dst, x1: src, x2: weight, x3: height, x4: width, x5: kernel_h, x6: kernel_w, x7: out_h_step | |||
| // x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step | |||
| DeconvDwFp32Center: | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style does not permit such amount of parameters | |||
| // NOTE(review): the post-indexed stp pair below leaves sp back at its entry value, so the | |||
| // saved x19-x22 live BELOW sp until the epilogue; AArch64 Linux has no red zone, so a | |||
| // signal delivered here could clobber them — confirm this matches project conventions. | |||
| sub sp, sp, #32 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| // args 9-13 arrive on the caller's stack, starting at the entry sp | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ldr x11, [sp, #24] | |||
| ldr x12, [sp, #32] | |||
| // convert the element-count strides to byte strides (sizeof(float) == 4) | |||
| mov x13, #4 | |||
| mul x7, x7, x13 | |||
| mul x8, x8, x13 | |||
| mul x9, x9, x13 | |||
| mul x10, x10, x13 | |||
| mul x11, x11, x13 | |||
| mul x12, x12, x13 | |||
| mov x13, #16 | |||
| mul x14, x6, x13 // x14 = kernel_w * 16 bytes: weight stride per kernel row (C4 floats) | |||
| LoopH: | |||
| mov x15, x0 // x15 = dst cursor for this output row | |||
| mov x16, x1 // x16 = src cursor for this output row | |||
| mov x17, x4 // x17 = width countdown | |||
| LoopW: | |||
| mov x18, x15 // NOTE(review): x18 is the platform register under AAPCS64 — avoid if portable | |||
| mov x19, x2 // x19 = weight cursor (restart weights per output position) | |||
| mov x20, x5 // x20 = kernel_h countdown | |||
| LoopKh: | |||
| mov x21, x18 // x21 = dst cursor within this kernel row | |||
| mov x22, x19 // x22 = weight cursor within this kernel row | |||
| mov x13, x6 // x13 reused here as the kernel_w countdown | |||
| LoopKw: | |||
| ld1 {v0.4s}, [x21] // load 4 dst channels (accumulator) | |||
| ld1 {v1.4s}, [x16] // load 4 src channels; same src pixel for the whole kernel | |||
| ld1 {v2.4s}, [x22], #16 // load 4 weights, post-increment weight cursor | |||
| fmla v0.4s, v1.4s, v2.4s // dst += src * weight | |||
| st1 {v0.4s}, [x21], x12 // store back, step dst by in_kw_step bytes | |||
| subs x13, x13, #1 | |||
| bne LoopKw | |||
| add x18, x18, x11 // next dst kernel row (in_kh_step bytes) | |||
| add x19, x19, x14 // next weight kernel row | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| add x15, x15, x10 // next output column: dst += in_sw_step bytes | |||
| add x16, x16, x8 // next src pixel: src += block_channel bytes | |||
| subs x17, x17, #1 | |||
| bne LoopW | |||
| add x0, x0, x9 // next output row: dst += in_sh_step bytes | |||
| add x1, x1, x7 // next src row: src += out_h_step bytes | |||
| subs x3, x3, #1 | |||
| bne LoopH | |||
| sub sp, sp, #32 // point sp back at the save area, then pop callee-saved pairs | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -81,20 +81,19 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias | |||
| } | |||
| } | |||
| #else | |||
| int oc4 = UP_DIV(output_channel, C4NUM); | |||
| if (bias_ptr != nullptr) { | |||
| if (is_relu) { | |||
| BiasAddRelu(bias_ptr, out_ptr, oc4, plane_size); | |||
| C4BiasAddRelu(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else if (is_relu6) { | |||
| BiasAddRelu6(bias_ptr, out_ptr, oc4, plane_size); | |||
| C4BiasAddRelu6(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else { | |||
| BiasAdd(bias_ptr, out_ptr, oc4, plane_size); | |||
| C4BiasAdd(out_ptr, c4_out_ptr, bias_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } | |||
| } else { | |||
| if (is_relu) { | |||
| Relu(out_ptr, oc4 * plane_size); | |||
| C4Relu(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else if (is_relu6) { | |||
| Relu6(out_ptr, oc4 * plane_size); | |||
| C4Relu6(out_ptr, c4_out_ptr, output_channel, plane_size, stride * sizeof(float)); | |||
| } else { | |||
| // do nothing | |||
| } | |||
| @@ -42,6 +42,17 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void BiasAddRelu(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void Relu6(float *data, size_t element4); | |||
| void Relu(float *data, size_t element4); | |||
| void C4BiasAdd(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h" | |||
| #include "src/runtime/kernel/arm/opclib/fp32/common_func.h" | |||
| #ifdef ENABLE_ARM64 | |||
| #include <arm_neon.h> | |||
| #endif | |||
| @@ -122,6 +123,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl | |||
| void DepthwiseCenter(float *dst, const float *src, const float *weight, const float *bias, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, | |||
| int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwFp32Center(dst, src, weight, bias, height, width, kernel_h, kernel_w, out_h_step, block_channel, | |||
| in_sh_step, in_sw_step, in_kh_step, in_kw_step, is_relu, is_relu6); | |||
| #else | |||
| float *dst_h = dst; | |||
| const float *src_h = src; | |||
| for (int oh = 0; oh < height; oh++) { | |||
| @@ -163,6 +168,7 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl | |||
| dst_h += out_h_step; | |||
| src_h += in_sh_step; | |||
| } // dst_height loop | |||
| #endif | |||
| } | |||
| // conv depthwise fp32: sliding window | |||
| @@ -262,6 +268,10 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in | |||
| void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, int height, int width, int kernel_h, | |||
| int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, | |||
| int in_kh_step, int in_kw_step) { | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwFp32Center(dst, src, weight, height, width, kernel_h, kernel_w, out_h_step, block_channel, | |||
| in_sh_step, in_sw_step, in_kh_step, in_kw_step); | |||
| #else | |||
| float *dst_h = dst; | |||
| const float *src_h = src; | |||
| for (int oh = 0; oh < height; oh++) { | |||
| @@ -297,6 +307,7 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in | |||
| dst_h += in_sh_step; | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| #endif | |||
| } | |||
| void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) { | |||