@@ -0,0 +1,222 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

// void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
//                           int depth, int col) {
// r0: a
// r1: b
// r2: c
// r3: bias
// r4: act_type
// r5: depth
// r6: col
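//
// Reference semantics (a sketch; the scalar MatVecMulA32Fp16 fallback later in this
// patch implements the same contract):
//   for (ci = 0; ci < col; ci++) {
//     v = 0;
//     for (di = 0; di < depth; di++) v += a[di] * b[ci * depth + di];
//     if (bias != NULL) v += bias[ci];
//     v = activate(v, act_type);  // act_type 1: ReLU, 3: ReLU6
//     c[ci] = v;
//   }
// Col4Loop produces four output columns per pass; Col1Loop handles the remainder.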
asm_function MatVecMulA32NeonFp16
    // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
    push {r0-r8, r9, r10, r11, lr}
    add sp, sp, #52
    ldr r4, [sp]
    ldr r5, [sp, #4]
    ldr r6, [sp, #8]
    add r10, r5, r5  // stride = depth * sizeof(float16_t)
    mov lr, #4
    mul r11, r10, lr  // stride x 4
    cmp r6, #4
    blt Col1Loop

Col4Loop:
    mov r7, r0  // reload a(vector) ptr
    mov r9, r1  // reload b(matrix) ptr
    mov r8, r5  // reload depth value
    veor q9, q9, q9
    veor q10, q10, q10
    veor q11, q11, q11
    veor q12, q12, q12
    veor q15, q15, q15
    cmp r8, #8
    bge Col4Depth8
    cmp r8, #4
    bge Col4Depth4
    cmp r8, #1
    bge Col4Depth1
    b Col4End

Col4Depth8:
    vld1.16 {q8}, [r7]!
    add lr, r9, r10
    vld1.16 {q0}, [r9]!
    vld1.16 {q1}, [lr], r10
    vld1.16 {q2}, [lr], r10
    vld1.16 {q3}, [lr]
    vmla.f16 q9, q8, q0
    vmla.f16 q10, q8, q1
    vmla.f16 q11, q8, q2
    vmla.f16 q12, q8, q3
    sub r8, r8, #8
    cmp r8, #8
    bge Col4Depth8
    cmp r8, #4
    bge Col4Depth4
    b AddC4

Col4Depth4:
    vld1.16 {d16}, [r7]!
    add lr, r9, r10
    vld1.16 {d0}, [r9]!
    vld1.16 {d2}, [lr], r10
    vld1.16 {d4}, [lr], r10
    vld1.16 {d6}, [lr]
    vmla.f16 d18, d16, d0
    vmla.f16 d20, d16, d2
    vmla.f16 d22, d16, d4
    vmla.f16 d24, d16, d6
    sub r8, r8, #4
    cmp r8, #4
    bge Col4Depth4

AddC4:
    vpadd.f16 d0, d18, d19
    vpadd.f16 d1, d20, d21
    vpadd.f16 d2, d22, d23
    vpadd.f16 d4, d24, d25
    vpadd.f16 d30, d0, d1
    vpadd.f16 d31, d2, d4
    vpadd.f16 d30, d30, d31
    cmp r8, #1
    bge Col4Depth1
    b Col4End

Col4Depth1:
    vld1.16 {d0[0]}, [r7]!
    add lr, r9, r10
    vld1.16 {d2[0]}, [r9]!
    vld1.16 {d2[1]}, [lr], r10
    vld1.16 {d2[2]}, [lr], r10
    vld1.16 {d2[3]}, [lr]
    vmla.f16 d30, d2, d0[0]
    subs r8, r8, #1
    bne Col4Depth1

Col4End:
    cmp r3, #0
    beq Col4Activation
    vld1.16 {d26}, [r3]!
    vadd.f16 d30, d30, d26

Col4Activation:
    cmp r4, #3
    beq Col4Relu6
    cmp r4, #1
    beq Col4Relu
    b Col4Write

Col4Relu6:
    vmov.i16 q12, #6
    vcvt.f16.s16 q12, q12
    vmin.f16 d30, d30, d24

Col4Relu:
    veor q13, q13, q13
    vmax.f16 d30, d30, d26

Col4Write:
    vst1.16 {d30}, [r2]!
    subs r6, r6, #4
    beq End
    add r1, r1, r11
    cmp r6, #4
    bge Col4Loop

Col1Loop:
    mov r7, r0  // reload a(vector) ptr
    mov r9, r1  // reload b(matrix) ptr
    mov r8, r5  // reload depth value
    veor q10, q10, q10
    veor q15, q15, q15
    cmp r8, #8
    bge Col1Depth8
    cmp r8, #4
    bge Col1Depth4
    cmp r8, #1
    bge Col1Depth1
    b Col1End

Col1Depth8:
    vld1.16 {q0}, [r7]!
    vld1.16 {q1}, [r9]!
    vmla.f16 q10, q1, q0
    sub r8, r8, #8
    cmp r8, #8
    bge Col1Depth8
    cmp r8, #4
    bge Col1Depth4
    b AddC1

Col1Depth4:
    vld1.16 {d0}, [r7]!
    vld1.16 {d2}, [r9]!
    vmla.f16 d20, d2, d0
    sub r8, r8, #4
    cmp r8, #4
    bge Col1Depth4

AddC1:
    vpadd.f16 d30, d20, d21
    vpadd.f16 d30, d30, d20
    vpadd.f16 d30, d30, d20
    cmp r8, #1
    bge Col1Depth1
    b Col1End

Col1Depth1:
    vld1.16 {d0[0]}, [r7]!
    vld1.16 {d2[0]}, [r9]!
    vmla.f16 d30, d2, d0[0]
    subs r8, r8, #1
    bne Col1Depth1

Col1End:
    cmp r3, #0
    beq Col1Activation
    vld1.16 {d28[0]}, [r3]!
    vadd.f16 d30, d30, d28

Col1Activation:
    cmp r4, #3
    beq Col1Relu6
    cmp r4, #1
    beq Col1Relu
    b Col1Write

Col1Relu6:
    vmov.i16 d26, #6
    vcvt.f16.s16 d26, d26
    vmin.f16 d30, d30, d26

Col1Relu:
    veor d24, d24, d24
    vmax.f16 d30, d30, d24

Col1Write:
    vst1.16 {d30[0]}, [r2]!
    subs r6, r6, #1
    beq End
    add r1, r1, r10
    b Col1Loop

End:
    sub sp, sp, #52
    pop {r0-r8, r9, r10, r11, pc}
#endif
| @@ -0,0 +1,93 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | |||||
| .align 5 | |||||
| asm_function TiledC4MatmulFp16 | |||||
| // void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4, | |||||
| // size_t oc4); | |||||
| // r0: dst | |||||
| // r1: src | |||||
| // r2: weight | |||||
| // r3: cal_num | |||||
| // r4(sp): ic4 | |||||
| // r5(sp + #4): oc4 | |||||
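//
// Per LoopIC step, one 4x4 fp16 weight tile (q4/q5) is multiplied into eight 4-channel
// source points (q0-q3), accumulating 32 fp16 results in q12-q15:
//   dst[dx * 4 + j] += src[dx * 4 + i] * weight[4 * i + j]
// The C reference TiledC4MatmulFp16 later in this patch has the same semantics.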
    push {r4-r11, lr}
    vpush {q4-q7}
    add sp, sp, #100
    ldr r4, [sp]
    ldr r5, [sp, #4]  // oc4
    add r3, r3, r3
    mov r7, r1
    cmp r5, #1
    blt LoopOCEnd
    cmp r4, #1
    blt LoopICEnd

LoopOC:
    ldr r4, [sp]
    veor q15, q15, q15
    veor q14, q14, q14
    veor q13, q13, q13
    veor q12, q12, q12

LoopIC:
    vld1.16 {q4, q5}, [r2]!  // weight
    vld1.16 {q2, q3}, [r1]!  // 16 number src
    vmla.f16 d24, d8, d4[0]
    vmla.f16 d24, d9, d4[1]
    vmla.f16 d24, d10, d4[2]
    vmla.f16 d24, d11, d4[3]
    vmla.f16 d25, d8, d5[0]
    vmla.f16 d25, d9, d5[1]
    vmla.f16 d25, d10, d5[2]
    vmla.f16 d25, d11, d5[3]
    vmla.f16 d26, d8, d6[0]
    vmla.f16 d26, d9, d6[1]
    vmla.f16 d26, d10, d6[2]
    vmla.f16 d26, d11, d6[3]
    vmla.f16 d27, d8, d7[0]
    vmla.f16 d27, d9, d7[1]
    vmla.f16 d27, d10, d7[2]
    vmla.f16 d27, d11, d7[3]
    vld1.16 {q0, q1}, [r1]!  // 16 number src
    vmla.f16 d28, d8, d0[0]
    vmla.f16 d28, d9, d0[1]
    vmla.f16 d28, d10, d0[2]
    vmla.f16 d28, d11, d0[3]
    vmla.f16 d29, d8, d1[0]
    vmla.f16 d29, d9, d1[1]
    vmla.f16 d29, d10, d1[2]
    vmla.f16 d29, d11, d1[3]
    vmla.f16 d30, d8, d2[0]
    vmla.f16 d30, d9, d2[1]
    vmla.f16 d30, d10, d2[2]
    vmla.f16 d30, d11, d2[3]
    vmla.f16 d31, d8, d3[0]
    vmla.f16 d31, d9, d3[1]
    vmla.f16 d31, d10, d3[2]
    vmla.f16 d31, d11, d3[3]
    subs r4, r4, #1
    bne LoopIC
    b LoopICEnd

LoopICEnd:
    mov lr, r0
    vst1.16 {q12, q13}, [lr]!
    vst1.16 {q14, q15}, [lr]!
    add r0, r0, r3  // dst += cal_num
    mov r1, r7
    subs r5, r5, #1
    bne LoopOC

LoopOCEnd:
    sub sp, sp, #100
    vpop {q4-q7}
    pop {r4-r11, pc}
#endif
| @@ -0,0 +1,150 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | |||||
| .align 5 | |||||
| // void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, | |||||
| // size_t length); | |||||
| //r0: S | |||||
| //r1: B | |||||
| //r2: M | |||||
| //r3: w | |||||
| //r4: h | |||||
| //r5: k | |||||
| //r6: length | |||||
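//
// Computes M[y][x] += B[i * h + y] * S[i][x] over i in [0, k), where each element is a
// block of 4 * length fp16 values (M = B^T * S); the C reference WinogradTransLeftFp16
// later in this patch spells out the same loop nest.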
asm_function WinogradTransLeftFp16
    push {r0, r3, r4-r11, lr}
    vpush {q4-q7}
    add sp, sp, #108
    ldr r4, [sp]
    ldr r6, [sp, #8]
    mov r8, #8  // 4 * sizeof(float16_t)
    mul r8, r6, r8  // length * 4 * 2
    mul r7, r3, r8  // step for S
    add r10, r4, r4  // step for B
    cmp r4, #1
    blt LoopHEnd
    cmp r3, #1
    blt LoopHEnd

LoopH:
    ldr r3, [sp, #-40]  // w
    ldr r0, [sp, #-44]

LoopW:
    mov r11, r0  // S
    mov lr, r1  // B_src
    veor q6, q6, q6
    ldr r6, [sp, #8]

InitZero:
    vst1.16 {d12}, [r2]!
    subs r6, r6, #1
    bne InitZero
    sub r2, r2, r8
    ldr r5, [sp, #4]
    cmp r5, #4
    bge LoopK4
    cmp r5, #3
    bge LoopK3
    cmp r5, #1
    bge LoopK1
    b LoopKEnd

LoopK4:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10
    vld1.16 {d3[0]}, [lr], r10
    vld1.16 {d5[0]}, [lr], r10
    vld1.16 {d7[0]}, [lr], r10
    add r12, r11, r7
    add r14, r12, r7
    add r9, r14, r7

LoopK4L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vld1.16 {d2}, [r12]!
    vmla.f16 d12, d0, d1[0]
    vld1.16 {d4}, [r14]!
    vmla.f16 d12, d2, d3[0]
    vld1.16 {d6}, [r9]!
    vmla.f16 d12, d4, d5[0]
    vmla.f16 d12, d6, d7[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK4L4
    subs r5, r5, #4  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    sub r9, r9, r8
    add r11, r9, r7
    cmp r5, #4
    bge LoopK4
    cmp r5, #3
    bge LoopK3
    b LoopK1

LoopK3:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10
    vld1.16 {d3[0]}, [lr], r10
    vld1.16 {d5[0]}, [lr], r10
    add r12, r11, r7
    add r9, r12, r7

LoopK3L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vld1.16 {d2}, [r12]!
    vmla.f16 d12, d0, d1[0]
    vld1.16 {d4}, [r9]!
    vmla.f16 d12, d2, d3[0]
    vmla.f16 d12, d4, d5[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK3L4
    subs r5, r5, #3  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    sub r9, r9, r8
    add r11, r9, r7
    cmp r5, #3
    bge LoopK3
    b LoopK1

LoopK1:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10

LoopK1L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vmla.f16 d12, d0, d1[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK1L4
    subs r5, r5, #1  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    sub r11, r11, r8
    add r11, r11, r7
    b LoopK1

LoopKEnd:
    add r0, r0, r8  // S += unitstep
    subs r3, r3, #1
    bne LoopW

LoopWEnd:
    subs r4, r4, #1
    beq LoopHEnd
    add r1, r1, #2  // B += 1
    b LoopH

LoopHEnd:
    sub sp, sp, #108
    vpop {q4-q7}
    pop {r0, r3, r4-r11, pc}
#endif
| @@ -0,0 +1,148 @@ | |||||
| #ifdef ENABLE_ARM32 | |||||
| #include "nnacl/assembly_global.h" | |||||
| .text | |||||
| .align 5 | |||||
| // void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, | |||||
| // size_t length); | |||||
| //r0: S | |||||
| //r1: B | |||||
| //r2: M | |||||
| //r3: w | |||||
| //r4: h | |||||
| //r5: k | |||||
| //r6: length | |||||
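//
// Computes M[y][x] += S[y][i] * B[i * h + x] over i in [0, k), where each element is a
// block of 4 * length fp16 values (M = S * B); see the C reference
// WinogradTransRightFp16 later in this patch.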
asm_function WinogradTransRightFp16
    push {r1, r3, r4-r11, lr}
    vpush {q4-q7}
    add sp, sp, #108
    ldr r4, [sp]
    ldr r5, [sp, #4]
    ldr r6, [sp, #8]
    mov r8, #8  // 4 * sizeof(float16_t)
    mul r8, r6, r8  // length * 4 * 2
    mul r7, r5, r8  // step for S = k * unitStep * 4
    add r10, r4, r4  // step for B = 2 * h
    cmp r4, #1
    blt LoopHEnd
    cmp r3, #1
    blt LoopHEnd

LoopH:
    ldr r3, [sp, #-40]  // w
    ldr r1, [sp, #-44]

LoopW:
    mov r11, r0  // S
    mov lr, r1  // B_src
    veor q6, q6, q6
    ldr r6, [sp, #8]

InitZero:
    vst1.16 {d12}, [r2]!
    subs r6, r6, #1
    bne InitZero
    sub r2, r2, r8
    ldr r5, [sp, #4]
    cmp r5, #4
    bge LoopK4
    cmp r5, #3
    bge LoopK3
    cmp r5, #1
    bge LoopK1
    b LoopKEnd

LoopK4:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10
    vld1.16 {d3[0]}, [lr], r10
    vld1.16 {d5[0]}, [lr], r10
    vld1.16 {d7[0]}, [lr], r10
    add r12, r11, r8
    add r14, r12, r8
    add r9, r14, r8

LoopK4L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vld1.16 {d2}, [r12]!
    vmla.f16 d12, d0, d1[0]
    vld1.16 {d4}, [r14]!
    vmla.f16 d12, d2, d3[0]
    vld1.16 {d6}, [r9]!
    vmla.f16 d12, d4, d5[0]
    vmla.f16 d12, d6, d7[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK4L4
    subs r5, r5, #4  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    mov r11, r9
    cmp r5, #4
    bge LoopK4
    cmp r5, #3
    bge LoopK3
    b LoopK1

LoopK3:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10
    vld1.16 {d3[0]}, [lr], r10
    vld1.16 {d5[0]}, [lr], r10
    add r12, r11, r8
    add r9, r12, r8

LoopK3L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vld1.16 {d2}, [r12]!
    vmla.f16 d12, d0, d1[0]
    vld1.16 {d4}, [r9]!
    vmla.f16 d12, d2, d3[0]
    vmla.f16 d12, d4, d5[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK3L4
    subs r5, r5, #3  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    mov r11, r9
    cmp r5, #3
    bge LoopK3
    b LoopK1

LoopK1:
    ldr r6, [sp, #8]
    vld1.16 {d1[0]}, [lr], r10

LoopK1L4:
    vld1.16 {d12}, [r2]
    vld1.16 {d0}, [r11]!
    vmla.f16 d12, d0, d1[0]
    vst1.16 {d12}, [r2]!  // dst
    subs r6, r6, #1  // length
    bne LoopK1L4
    subs r5, r5, #1  // k
    beq LoopKEnd
    sub r2, r2, r8  // dst - step
    b LoopK1

LoopKEnd:
    add r1, r1, #2  // B[x]
    subs r3, r3, #1
    bne LoopW

LoopWEnd:
    add r0, r0, r7
    subs r4, r4, #1
    beq LoopHEnd
    b LoopH

LoopHEnd:
    sub sp, sp, #108
    vpop {q4-q7}
    pop {r1, r3, r4-r11, pc}
#endif
@@ -41,32 +41,36 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f
void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
                        size_t oc_stride, ActType act_type) {
#ifdef ENABLE_ARM64
  size_t oc8mod = oc % C8NUM;
  size_t oc8div = oc - oc8mod;
  size_t stride_size = oc_stride * sizeof(float16_t);
  PostFuncBiasReluC8Fp16(nhwc_out, c8_out, bias, oc8div, oc8mod, plane, stride_size, act_type);
#else
  PostConvFuncCommFp16(nhwc_out, c8_out, bias, oc, plane, oc_stride, plane, act_type, C8NUM);
#endif
}

void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
                        size_t plane_stride, ActType act_type) {
#ifdef ENABLE_ARM64
  size_t oc4mod = oc % C4NUM;
  size_t oc4div = oc - oc4mod;
  size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t);
  PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type);
#else
  PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM);
#endif
}

#ifdef ENABLE_ARM82_A32
void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
                            size_t plane_size, size_t plane_stride, size_t relu_type) {
  // TODO(fun): function
}

void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
                            size_t plane_size, size_t stride, size_t relu_type) {
  // TODO(fun): function
}
#endif
@@ -41,6 +41,55 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
  return;
}

#ifdef ENABLE_ARM82_A32
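// Strided fp16 accumulate used by DeConvWgMergeFp16 below: adds eight rows of four
// halfwords from src into dst, with src_step and dst_step given in bytes; this mirrors
// the ARM64 inline-assembly path.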
void DeconvWgMergeFp16A32Fun(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_step, size_t dst_step) {
  asm volatile(
    "mov r7, %[src_ptr]\n"
    "mov r8, %[dst_ptr]\n"
    "mov r10, r8\n"
    "vld1.16 {d0}, [r7], %[src_step]\n"
    "vld1.16 {d2}, [r8], %[dst_step]\n"
    "vld1.16 {d4}, [r7], %[src_step]\n"
    "vld1.16 {d6}, [r8], %[dst_step]\n"
    "vadd.f16 d0, d0, d2\n"
    "vld1.16 {d8}, [r7], %[src_step]\n"
    "vadd.f16 d4, d4, d6\n"
    "vst1.16 {d0}, [r10], %[dst_step]\n"
    "vst1.16 {d4}, [r10], %[dst_step]\n"
    "vld1.16 {d10}, [r8], %[dst_step]\n"
    "vld1.16 {d12}, [r7], %[src_step]\n"
    "vadd.f16 d8, d8, d10\n"
    "vld1.16 {d14}, [r8], %[dst_step]\n"
    "vadd.f16 d12, d12, d14\n"
    "vld1.16 {d0}, [r7], %[src_step]\n"
    "vst1.16 {d8}, [r10], %[dst_step]\n"
    "vst1.16 {d12}, [r10], %[dst_step]\n"
    "vld1.16 {d2}, [r8], %[dst_step]\n"
    "vld1.16 {d4}, [r7], %[src_step]\n"
    "vld1.16 {d6}, [r8], %[dst_step]\n"
    "vadd.f16 d0, d0, d2\n"
    "vadd.f16 d4, d4, d6\n"
    "vst1.16 {d0}, [r10], %[dst_step]\n"
    "vst1.16 {d4}, [r10], %[dst_step]\n"
    "vld1.16 {d8}, [r7], %[src_step]\n"
    "vld1.16 {d10}, [r8], %[dst_step]\n"
    "vld1.16 {d12}, [r7], %[src_step]\n"
    "vld1.16 {d14}, [r8], %[dst_step]\n"
    "vadd.f16 d8, d8, d10\n"
    "vadd.f16 d12, d12, d14\n"
    "vst1.16 {d8}, [r10], %[dst_step]\n"
    "vst1.16 {d12}, [r10], %[dst_step]\n"
    :
    : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
    : "r7", "r8", "r10", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
#endif

void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride, size_t count) {
  const float16_t *src_ptr = src;
  float16_t *dst_ptr = dst;
@@ -94,8 +143,18 @@ void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride,
    :
    : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
    : "x7", "x8", "x10", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#elif defined(ENABLE_ARM82_A32)
    size_t src_step = src_stride * sizeof(float16_t);
    size_t dst_step = dst_stride * sizeof(float16_t);
    DeconvWgMergeFp16A32Fun(src_ptr, dst_ptr, src_step, dst_step);
#else
    // Scalar fallback: accumulate eight rows of four fp16 values.
    for (int j = 0; j < 8; j++) {
      const float16_t *s = src_ptr + j * src_stride;
      float16_t *d = dst_ptr + j * dst_stride;
      for (int k = 0; k < 4; k++) {
        d[k] += s[k];
      }
    }
#endif
    src_ptr += C8NUM * src_stride;
    dst_ptr += C8NUM * dst_stride;
@@ -377,22 +436,78 @@ void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParamet
  return;
}
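
// Portable C fallbacks for builds without the ARM fp16 kernels; they also document the
// contracts of the ARM32 .S implementations added in this patch. Note that the
// TiledC4MatmulFp16 signature now takes cal_num before ic4, matching the assembly.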
#ifndef ENABLE_ARM
void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
                           size_t length) {
  const int unitStep = 4 * length;
  for (int y = 0; y < h; ++y) {
    float16_t *dstY = M + y * w * unitStep;
    for (int x = 0; x < w; ++x) {
      float16_t *dstX = dstY + x * unitStep;
      const float16_t *srcX = S + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float16_t));
      for (int i = 0; i < k; ++i) {
        float16_t b = B[i * h + y];
        const float16_t *srcY = srcX + i * w * unitStep;
        if (0.0f == b) {
          continue;
        }
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += srcY[j] * b;
        }
      }
    }
  }
}

void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
                            size_t length) {
  const int unitStep = 4 * length;
  for (int y = 0; y < h; ++y) {
    float16_t *dstY = M + y * w * unitStep;
    const float16_t *srcY = S + y * k * unitStep;
    for (int x = 0; x < w; ++x) {
      float16_t *dstX = dstY + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float16_t));
      for (int i = 0; i < k; ++i) {
        const float16_t *srcX = srcY + i * unitStep;
        float16_t b = B[i * h + x];
        if (0.0f == b) {
          continue;
        }
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += srcX[j] * b;
        }
      }
    }
  }
}

void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4,
                       size_t oc4) {
  int dx, sz, dz;
  int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
  for (dz = 0; dz < oc4; ++dz) {
    float16_t *dst_z = dst + dz * cal_num;
    const float16_t *weight_dz = weight + dz * ic4 * 16;
    for (dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) {
      float16_t *dst_x = dst_z + dx * 4;
      dst_x[0] = 0.0f;
      dst_x[1] = 0.0f;
      dst_x[2] = 0.0f;
      dst_x[3] = 0.0f;
      const float16_t *src_dx = src + 4 * dx;
      for (sz = 0; sz < ic4; ++sz) {
        const float16_t *src_z = src_dx + sz * src_depth_step;
        const float16_t *weight_z = weight_dz + sz * 16;
        for (int i = 0; i < 4; ++i) {
          for (int j = 0; j < 4; ++j) {
            dst_x[j] += src_z[i] * weight_z[4 * i + j];
          }
        }
      }
    }
  }
}
#endif
@@ -371,8 +371,16 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
#ifdef ENABLE_ARM82_A32
void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                      int depth, int col) {
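  // Plain scalar reference; MatVecMulFp16 now dispatches to the NEON kernel
  // MatVecMulA32NeonFp16 instead (see the change to MatVecMulFp16 below).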
  for (int ci = 0; ci < col; ci++) {
    float value = 0;
    for (int di = 0; di < depth; di++) {
      value += a[di] * b[ci * depth + di];
    }
    if (bias != NULL) value += bias[ci];
    if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
    if (act_type == ActType_Relu || act_type == ActType_Relu6) value = MSMAX(0.0f, value);
    c[ci] = value;
  }
}
#endif
@@ -381,7 +389,7 @@ void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f
#ifdef ENABLE_ARM64
  MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col);
#else
  MatVecMulA32NeonFp16(a, b, c, bias, (int)act_type, depth, col);
#endif
}
@@ -609,6 +617,23 @@ void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col,
  return;
}

void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
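  // Packs row-major src into 12-row column panels:
  //   dst[(r / 12) * 12 * col + c * 12 + (r % 12)] = src[r * col + c]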
  if (is_fp32_src) {
    const float *fp32_src = (const float *)src;
    for (int r = 0; r < row; r++) {
      for (int c = 0; c < col; c++) {
        int r_div12 = r / 12;
        int r_mod12 = r % 12;
        dst[r_div12 * 12 * col + c * 12 + r_mod12] = (float16_t)(fp32_src[r * col + c]);
      }
    }
  } else {
    const float16_t *fp16_src = (const float16_t *)src;
    RowMajor2Col12MajorFp16Opt(fp16_src, dst, row, col);
  }
  return;
}

void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
@@ -623,6 +648,20 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col,
  }
}

void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
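  // Packs row-major src into 12-column row panels:
  //   dst[(c / 12) * 12 * row + r * 12 + (c % 12)] = src[r * col + c]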
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div12 = c / 12;
      int c_mod12 = c % 12;
      if (is_fp32_src) {
        dst[c_div12 * 12 * row + r * 12 + c_mod12] = (float16_t)(((const float *)src)[r * col + c]);
      } else {
        dst[c_div12 * 12 * row + r * 12 + c_mod12] = ((const float16_t *)src)[r * col + c];
      }
    }
  }
}

void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
@@ -19,9 +19,6 @@
#include <float.h>
#include <string.h>
#include "nnacl/errorcode.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/op_base.h"
@@ -63,6 +60,9 @@ void MatMul12x8A32Fp16(const float16_t *a, const float16_t *b, float16_t *dst, c
void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                      int depth, int col);

void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                          int depth, int col);
#endif

void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
@@ -79,8 +79,12 @@ void RowMajor2Col12MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);

void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
@@ -114,7 +114,12 @@ void MatmulBaseFP16CPUKernel::ResizeParameter() {
    params_->row_align_ = 1;
    params_->col_align_ = params_->col_;
  } else {
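    // The packed-A row tile differs by ISA: the ARM64 kernels consume 16-row tiles,
    // the ARM32 fp16 kernels added in this patch consume 12-row tiles.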
#ifdef ENABLE_ARM64
    int row_tile = C16NUM;
#else
    int row_tile = C12NUM;
#endif
    params_->row_align_ = UP_ROUND(params_->row_, row_tile);
    params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
  }
  return;
@@ -163,9 +168,17 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
    int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
    float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
    if (params_->a_transpose_) {
#ifdef ENABLE_ARM64
      RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#else
      RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#endif
    } else {
#ifdef ENABLE_ARM64
      RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#else
      RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#endif
    }
  }
  return;