diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/MatVecMulFp16.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/MatVecMulFp16.S new file mode 100644 index 0000000000..936827dedb --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/MatVecMulFp16.S @@ -0,0 +1,222 @@ +#ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" + +.text +.align 5 + +// void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, +// int depth, int col) { +// r0: a +// r1: b +// r2: c +// r3: bias +// r4: act_type +// r5: depth +// r6: col + +asm_function MatVecMulA32NeonFp16 + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf + push {r0-r8, r9, r10, r11, lr} + add sp, sp, #52 + + ldr r4, [sp] + ldr r5, [sp, #4] + ldr r6, [sp, #8] + + add r10, r5, r5 // stride = depth * sizeof(float16_t) + mov lr, #4 + mul r11, r10, lr // stride x 4 + + cmp r6, #4 + blt Col1Loop + +Col4Loop: + mov r7, r0 // reload a(vector) ptr + mov r9, r1 // reload b(matrix) ptr + mov r8, r5 // reload depth value + + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + veor q12, q12, q12 + veor q15, q15, q15 + + cmp r8, #8 + bge Col4Depth8 + cmp r8, #4 + bge Col4Depth4 + cmp r8, #1 + bge Col4Depth1 + b Col4End + + Col4Depth8: + vld1.16 {q8}, [r7]! + add lr, r9, r10 + vld1.16 {q0}, [r9]! + vld1.16 {q1}, [lr], r10 + vld1.16 {q2}, [lr], r10 + vld1.16 {q3}, [lr] + + vmla.f16 q9, q8, q0 + vmla.f16 q10, q8, q1 + vmla.f16 q11, q8, q2 + vmla.f16 q12, q8, q3 + sub r8, r8, #8 + cmp r8, #8 + bge Col4Depth8 + cmp r8, #4 + bge Col4Depth4 + b AddC4 + + Col4Depth4: + vld1.16 {d16}, [r7]! + add lr, r9, r10 + vld1.16 {d0}, [r9]! + vld1.16 {d2}, [lr], r10 + vld1.16 {d4}, [lr], r10 + vld1.16 {d6}, [lr] + + vmla.f16 d18, d16, d0 + vmla.f16 d20, d16, d2 + vmla.f16 d22, d16, d4 + vmla.f16 d24, d16, d6 + sub r8, r8, #4 + cmp r8, #4 + bge Col4Depth4 + + AddC4: + vpadd.f16 d0, d18, d19 + vpadd.f16 d1, d20, d21 + vpadd.f16 d2, d22, d23 + vpadd.f16 d4, d24, d25 + vpadd.f16 d30, d0, d1 + vpadd.f16 d31, d2, d4 + vpadd.f16 d30, d30, d31 + cmp r8, #1 + bge Col4Depth1 + b Col4End + + Col4Depth1: + vld1.16 {d0[0]}, [r7]! + add lr, r9, r10 + vld1.16 {d2[0]}, [r9]! + vld1.16 {d2[1]}, [lr], r10 + vld1.16 {d2[2]}, [lr], r10 + vld1.16 {d2[3]}, [lr] + + vmla.f16 d30, d2, d0[0] + subs r8, r8, #1 + bne Col4Depth1 + + Col4End: + cmp r3, #0 + beq Col4Activation + vld1.16 {d26}, [r3]! + vadd.f16 d30, d30, d26 + + Col4Activation: + cmp r4, #3 + beq Col4Relu6 + cmp r4, #1 + beq Col4Relu + b Col4Write + + Col4Relu6: + vmov.i16 q12, #6 + vcvt.f16.s16 q12, q12 + vmin.f16 d30, d30, d24 + + Col4Relu: + veor q13, q13, q13 + vmax.f16 d30, d30, d26 + + Col4Write: + vst1.16 {d30}, [r2]! + subs r6, r6, #4 + beq End + add r1, r1, r11 + cmp r6, #4 + bge Col4Loop + +Col1Loop: + mov r7, r0 // reload a(vector) ptr + mov r9, r1 // reload b(matrix) ptr + mov r8, r5 // reload depth value + veor q10, q10, q10 + veor q15, q15, q15 + + cmp r8, #8 + bge Col1Depth8 + cmp r8, #4 + bge Col1Depth4 + cmp r8, #1 + bge Col1Depth1 + b Col1End + + Col1Depth8: + vld1.16 {q0}, [r7]! + vld1.16 {q1}, [r9]! + vmla.f16 q10, q1, q0 + sub r8, r8, #8 + cmp r8, #8 + bge Col1Depth8 + cmp r8, #4 + bge Col1Depth4 + b AddC1 + + Col1Depth4: + vld1.16 {d0}, [r7]! + vld1.16 {d2}, [r9]! 
+ vmla.f16 d20, d2, d0 + sub r8, r8, #4 + cmp r8, #4 + bge Col1Depth4 + + AddC1: + vpadd.f16 d30, d20, d21 + vpadd.f16 d30, d30, d20 + vpadd.f16 d30, d30, d20 + cmp r8, #1 + bge Col1Depth1 + b Col1End + + Col1Depth1: + vld1.16 {d0[0]}, [r7]! + vld1.16 {d2[0]}, [r9]! + vmla.f16 d30, d2, d0[0] + subs r8, r8, #1 + bne Col1Depth1 + + Col1End: + cmp r3, #0 + beq Col1Activation + vld1.16 {d28[0]}, [r3]! + vadd.f16 d30, d30, d28 + + Col1Activation: + cmp r4, #3 + beq Col1Relu6 + cmp r4, #1 + beq Col1Relu + b Col1Write + + Col1Relu6: + vmov.i16 d26, #6 + vcvt.f16.s16 d26, d26 + vmin.f16 d30, d30, d26 + + Col1Relu: + veor d24, d24, d24 + vmax.f16 d30, d30, d24 + + Col1Write: + vst1.16 {d30[0]}, [r2]! + subs r6, r6, #1 + beq End + add r1, r1, r10 + b Col1Loop + +End: + sub sp, sp, #52 + pop {r0-r8, r9, r10, r11, pc} +#endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/TiledC4MatmulFp16.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/TiledC4MatmulFp16.S new file mode 100644 index 0000000000..726f87c902 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/TiledC4MatmulFp16.S @@ -0,0 +1,93 @@ +#ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" + +.text +.align 5 + +asm_function TiledC4MatmulFp16 +// void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4, +// size_t oc4); +// r0: dst +// r1: src +// r2: weight +// r3: cal_num +// r4(sp): ic4 +// r5(sp + #4): oc4 +push {r4-r11, lr} +vpush {q4-q7} +add sp, sp, #100 +ldr r4, [sp] +ldr r5, [sp, #4] // oc4 +add r3, r3, r3 +mov r7, r1 + +cmp r5, #1 +blt LoopOCEnd +cmp r4, #1 +blt LoopICEnd +LoopOC: + ldr r4, [sp] + veor q15, q15, q15 + veor q14, q14, q14 + veor q13, q13, q13 + veor q12, q12, q12 + LoopIC: + vld1.16 {q4, q5}, [r2]! // weight + vld1.16 {q2, q3}, [r1]! // 16 number src + vmla.f16 d24, d8, d4[0] + vmla.f16 d24, d9, d4[1] + vmla.f16 d24, d10, d4[2] + vmla.f16 d24, d11, d4[3] + + vmla.f16 d25, d8, d5[0] + vmla.f16 d25, d9, d5[1] + vmla.f16 d25, d10, d5[2] + vmla.f16 d25, d11, d5[3] + + vmla.f16 d26, d8, d6[0] + vmla.f16 d26, d9, d6[1] + vmla.f16 d26, d10, d6[2] + vmla.f16 d26, d11, d6[3] + + vmla.f16 d27, d8, d7[0] + vmla.f16 d27, d9, d7[1] + vmla.f16 d27, d10, d7[2] + vmla.f16 d27, d11, d7[3] + + vld1.16 {q0, q1}, [r1]! // 16 number src + vmla.f16 d28, d8, d0[0] + vmla.f16 d28, d9, d0[1] + vmla.f16 d28, d10, d0[2] + vmla.f16 d28, d11, d0[3] + + vmla.f16 d29, d8, d1[0] + vmla.f16 d29, d9, d1[1] + vmla.f16 d29, d10, d1[2] + vmla.f16 d29, d11, d1[3] + + vmla.f16 d30, d8, d2[0] + vmla.f16 d30, d9, d2[1] + vmla.f16 d30, d10, d2[2] + vmla.f16 d30, d11, d2[3] + + vmla.f16 d31, d8, d3[0] + vmla.f16 d31, d9, d3[1] + vmla.f16 d31, d10, d3[2] + vmla.f16 d31, d11, d3[3] + + subs r4, r4, #1 + bne LoopIC + b LoopICEnd + LoopICEnd: + mov lr, r0 + vst1.16 {q12, q13}, [lr]! + vst1.16 {q14, q15}, [lr]! 
+ add r0, r0, r3 // dst += cal_num + mov r1, r7 + subs r5, r5, #1 + bne LoopOC +LoopOCEnd: + sub sp, sp, #100 + vpop {q4-q7} + pop {r4-r11, pc} +#endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransLeft.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransLeft.S new file mode 100644 index 0000000000..364f4ce646 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransLeft.S @@ -0,0 +1,150 @@ +#ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" + +.text +.align 5 + +// void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, +// size_t length); +//r0: S +//r1: B +//r2: M +//r3: w +//r4: h +//r5: k +//r6: length +asm_function WinogradTransLeftFp16 + push {r0, r3, r4-r11, lr} + vpush {q4-q7} + add sp, sp, #108 + ldr r4, [sp] + ldr r6, [sp, #8] + + mov r8, #8 // 4 * sizeof(float16_t) + mul r8, r6, r8 // length * 4 * 2 + mul r7, r3, r8 // step for S + add r10, r4, r4 // step for B + +cmp r4, #1 +blt LoopHEnd +cmp r3, #1 +blt LoopHEnd +LoopH: + ldr r3, [sp, #-40] // w + ldr r0, [sp, #-44] + LoopW: + mov r11, r0 // S + mov lr, r1 // B_src + veor q6, q6, q6 + ldr r6, [sp, #8] + InitZero: + vst1.16 {d12}, [r2]! + subs r6, r6, #1 + bne InitZero + sub r2, r2, r8 + + ldr r5, [sp, #4] + cmp r5, #4 + bge LoopK4 + cmp r5, #3 + bge LoopK3 + cmp r5, #1 + bge LoopK1 + b LoopKEnd + + LoopK4: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + vld1.16 {d3[0]}, [lr], r10 + vld1.16 {d5[0]}, [lr], r10 + vld1.16 {d7[0]}, [lr], r10 + + add r12, r11, r7 + add r14, r12, r7 + add r9, r14, r7 + LoopK4L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vld1.16 {d2}, [r12]! + vmla.f16 d12, d0, d1[0] + vld1.16 {d4}, [r14]! + vmla.f16 d12, d2, d3[0] + vld1.16 {d6}, [r9]! + vmla.f16 d12, d4, d5[0] + vmla.f16 d12, d6, d7[0] + vst1.16 {d12}, [r2]! // dst + subs r6, r6, #1 // length + bne LoopK4L4 + + subs r5, r5, #4 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + sub r9, r9, r8 + add r11, r9, r7 + cmp r5, #4 + bge LoopK4 + cmp r5, #3 + bge LoopK3 + b LoopK1 + + LoopK3: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + vld1.16 {d3[0]}, [lr], r10 + vld1.16 {d5[0]}, [lr], r10 + + add r12, r11, r7 + add r9, r12, r7 + LoopK3L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vld1.16 {d2}, [r12]! + vmla.f16 d12, d0, d1[0] + vld1.16 {d4}, [r9]! + vmla.f16 d12, d2, d3[0] + vmla.f16 d12, d4, d5[0] + vst1.16 {d12}, [r2]! // dst + subs r6, r6, #1 // length + bne LoopK3L4 + + subs r5, r5, #3 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + sub r9, r9, r8 + add r11, r9, r7 + cmp r5, #3 + bge LoopK3 + b LoopK1 + + LoopK1: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + + LoopK1L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vmla.f16 d12, d0, d1[0] + vst1.16 {d12}, [r2]! 
// dst + subs r6, r6, #1 // length + bne LoopK1L4 + + subs r5, r5, #1 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + sub r11, r11, r8 + add r11, r11, r7 + b LoopK1 + LoopKEnd: + add r0, r0, r8 // S += unitstep + subs r3, r3, #1 + bne LoopW + LoopWEnd: + subs r4, r4, #1 + beq LoopHEnd + add r1, r1, #2 // B += 1 + b LoopH +LoopHEnd: + sub sp, sp, #108 + vpop {q4-q7} + pop {r0, r3, r4-r11, pc} +#endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransRight.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransRight.S new file mode 100644 index 0000000000..5ad8dee3b7 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/WinogradTransRight.S @@ -0,0 +1,148 @@ +#ifdef ENABLE_ARM32 +#include "nnacl/assembly_global.h" + +.text +.align 5 + +// void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, +// size_t length); +//r0: S +//r1: B +//r2: M +//r3: w +//r4: h +//r5: k +//r6: length +asm_function WinogradTransRightFp16 + push {r1, r3, r4-r11, lr} + vpush {q4-q7} + add sp, sp, #108 + ldr r4, [sp] + ldr r5, [sp, #4] + ldr r6, [sp, #8] + + mov r8, #8 // 4 * sizeof(float16_t) + mul r8, r6, r8 // length * 4 * 2 + mul r7, r5, r8 // step for S = k * unitStep * 4 + add r10, r4, r4 // step for B = 2 * h + +cmp r4, #1 +blt LoopHEnd +cmp r3, #1 +blt LoopHEnd +LoopH: + ldr r3, [sp, #-40] // w + ldr r1, [sp, #-44] + LoopW: + mov r11, r0 // S + mov lr, r1 // B_src + veor q6, q6, q6 + ldr r6, [sp, #8] + InitZero: + vst1.16 {d12}, [r2]! + subs r6, r6, #1 + bne InitZero + sub r2, r2, r8 + + ldr r5, [sp, #4] + cmp r5, #4 + bge LoopK4 + cmp r5, #3 + bge LoopK3 + cmp r5, #1 + bge LoopK1 + b LoopKEnd + + LoopK4: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + vld1.16 {d3[0]}, [lr], r10 + vld1.16 {d5[0]}, [lr], r10 + vld1.16 {d7[0]}, [lr], r10 + + add r12, r11, r8 + add r14, r12, r8 + add r9, r14, r8 + LoopK4L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vld1.16 {d2}, [r12]! + vmla.f16 d12, d0, d1[0] + vld1.16 {d4}, [r14]! + vmla.f16 d12, d2, d3[0] + vld1.16 {d6}, [r9]! + vmla.f16 d12, d4, d5[0] + vmla.f16 d12, d6, d7[0] + vst1.16 {d12}, [r2]! // dst + subs r6, r6, #1 // length + bne LoopK4L4 + + subs r5, r5, #4 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + mov r11, r9 + cmp r5, #4 + bge LoopK4 + cmp r5, #3 + bge LoopK3 + b LoopK1 + + LoopK3: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + vld1.16 {d3[0]}, [lr], r10 + vld1.16 {d5[0]}, [lr], r10 + + add r12, r11, r8 + add r9, r12, r8 + LoopK3L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vld1.16 {d2}, [r12]! + vmla.f16 d12, d0, d1[0] + vld1.16 {d4}, [r9]! + vmla.f16 d12, d2, d3[0] + vmla.f16 d12, d4, d5[0] + vst1.16 {d12}, [r2]! // dst + subs r6, r6, #1 // length + bne LoopK3L4 + + subs r5, r5, #3 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + mov r11, r9 + cmp r5, #3 + bge LoopK3 + b LoopK1 + + LoopK1: + ldr r6, [sp, #8] + vld1.16 {d1[0]}, [lr], r10 + + LoopK1L4: + vld1.16 {d12}, [r2] + vld1.16 {d0}, [r11]! + vmla.f16 d12, d0, d1[0] + vst1.16 {d12}, [r2]! 
// dst + subs r6, r6, #1 // length + bne LoopK1L4 + + subs r5, r5, #1 // k + beq LoopKEnd + sub r2, r2, r8 // dst - step + b LoopK1 + + LoopKEnd: + add r1, r1, #2 // B[x] + subs r3, r3, #1 + bne LoopW + LoopWEnd: + add r0, r0, r7 + subs r4, r4, #1 + beq LoopHEnd + b LoopH +LoopHEnd: + sub sp, sp, #108 + vpop {q4-q7} + pop {r1, r3, r4-r11, pc} +#endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c index 9a66f52e5d..370258cacd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/common_func_fp16.c @@ -41,32 +41,36 @@ void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const f void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane, size_t oc_stride, ActType act_type) { +#ifdef ENABLE_ARM64 size_t oc8mod = oc % C8NUM; size_t oc8div = oc - oc8mod; size_t stride_size = oc_stride * sizeof(float16_t); PostFuncBiasReluC8Fp16(nhwc_out, c8_out, bias, oc8div, oc8mod, plane, stride_size, act_type); - return; +#else + PostConvFuncCommFp16(nhwc_out, c8_out, bias, oc, plane, oc_stride, plane, act_type, C8NUM); +#endif } void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane, size_t plane_stride, ActType act_type) { +#ifdef ENABLE_ARM64 size_t oc4mod = oc % C4NUM; size_t oc4div = oc - oc4mod; size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t); PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type); - return; +#else + PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM); +#endif } #ifdef ENABLE_ARM82_A32 void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod, size_t plane_size, size_t plane_stride, size_t relu_type) { // TODO(fun): function - return; } void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod, size_t plane_size, size_t stride, size_t relu_type) { // TODO(fun): function - return; } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c index f8003ec939..c1a97e0c95 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c @@ -41,6 +41,55 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel, return; } +#ifdef ENABLE_ARM82_A32 +void DeconvWgMergeFp16A32Fun(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_step, size_t dst_step) { + asm volatile( + "mov r7, %[src_ptr]\n" + "mov r8, %[dst_ptr]\n" + "mov r10, r8\n" + + "vld1.16 {d0}, [r7], %[src_step]\n" + "vld1.16 {d2}, [r8], %[dst_step]\n" + "vld1.16 {d4}, [r7], %[src_step]\n" + "vld1.16 {d6}, [r8], %[dst_step]\n" + "vadd.f16 d0, d0, d2\n" + "vld1.16 {d8}, [r7], %[src_step]\n" + "vadd.f16 d4, d4, d6\n" + "vst1.16 {d0}, [r10], %[dst_step]\n" + "vst1.16 {d4}, [r10], %[dst_step]\n" + + "vld1.16 {d10}, [r8], %[dst_step]\n" + "vld1.16 {d12}, [r7], %[src_step]\n" + "vadd.f16 d8, d8, d10\n" + "vld1.16 {d14}, [r8], %[dst_step]\n" + "vadd.f16 d12, d12, d14\n" + "vld1.16 {d0}, [r7], %[src_step]\n" + "vst1.16 {d8}, 
[r10], %[dst_step]\n" + "vst1.16 {d12}, [r10], %[dst_step]\n" + + "vld1.16 {d2}, [r8], %[dst_step]\n" + "vld1.16 {d4}, [r7], %[src_step]\n" + "vld1.16 {d6}, [r8], %[dst_step]\n" + "vadd.f16 d0, d0, d2\n" + "vadd.f16 d4, d4, d6\n" + "vst1.16 {d0}, [r10], %[dst_step]\n" + "vst1.16 {d4}, [r10], %[dst_step]\n" + + "vld1.16 {d8}, [r7], %[src_step]\n" + "vld1.16 {d10}, [r8], %[dst_step]\n" + "vld1.16 {d12}, [r7], %[src_step]\n" + "vld1.16 {d14}, [r8], %[dst_step]\n" + "vadd.f16 d8, d8, d10\n" + "vadd.f16 d12, d12, d14\n" + "vst1.16 {d8}, [r10], %[dst_step]\n" + "vst1.16 {d12}, [r10], %[dst_step]\n" + + : + : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step) + : "r7", "r8", "r10", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} +#endif + void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride, size_t count) { const float16_t *src_ptr = src; float16_t *dst_ptr = dst; @@ -94,8 +143,18 @@ void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, : : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step) : "x7", "x8", "x10", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#elif defined(ENABLE_ARM82_A32) + size_t src_step = src_stride * sizeof(float16_t); + size_t dst_step = dst_stride * sizeof(float16_t); + DeconvWgMergeFp16A32Fun(src_ptr, dst_ptr, src_step, dst_step); #else - // TODO(fun): arm32 + for (int j = 0; j < 8; j++) { + const float16_t *s = src_ptr + j * src_stride; + float16_t *d = dst_ptr + j * dst_stride; + for (int k = 0; k < 4; k++) { + d[k] += s[k]; + } + } #endif src_ptr += C8NUM * src_stride; dst_ptr += C8NUM * dst_stride; @@ -377,22 +436,78 @@ void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParamet return; } -#ifdef ENABLE_ARM82_A32 +#ifndef ENABLE_ARM void WinogradTransLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, size_t length) { - // TODO(fun): function - return; + const int unitStep = 4 * length; + for (int y = 0; y < h; ++y) { + float16_t *dstY = M + y * w * unitStep; + for (int x = 0; x < w; ++x) { + float16_t *dstX = dstY + x * unitStep; + const float16_t *srcX = S + x * unitStep; + memset(dstX, 0, unitStep * sizeof(float16_t)); + for (int i = 0; i < k; ++i) { + float16_t b = B[i * h + y]; + const float16_t *srcY = srcX + i * w * unitStep; + if (0.0f == b) { + continue; + } + for (int j = 0; j < unitStep; ++j) { + dstX[j] += srcY[j] * b; + } + } + } + } } void WinogradTransRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k, size_t length) { - // TODO(fun): function - return; + const int unitStep = 4 * length; + for (int y = 0; y < h; ++y) { + float16_t *dstY = M + y * w * unitStep; + const float16_t *srcY = S + y * k * unitStep; + + for (int x = 0; x < w; ++x) { + float16_t *dstX = dstY + x * unitStep; + memset(dstX, 0, unitStep * sizeof(float16_t)); + for (int i = 0; i < k; ++i) { + const float16_t *srcX = srcY + i * unitStep; + float16_t b = B[i * h + x]; + if (0.0f == b) { + continue; + } + for (int j = 0; j < unitStep; ++j) { + dstX[j] += srcX[j] * b; + } + } + } + } } -void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t ic4, size_t cal_num, +void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t cal_num, size_t ic4, size_t oc4) { - // TODO(fun): function - return; + int dx, sz, dz; + int src_depth_step = 
4 * DECONV_WINOGRAD_DEFAULT_TILE; + for (dz = 0; dz < oc4; ++dz) { + float16_t *dst_z = dst + dz * cal_num; + const float16_t *weight_dz = weight + dz * ic4 * 16; + for (dx = 0; dx < DECONV_WINOGRAD_DEFAULT_TILE; ++dx) { + float16_t *dst_x = dst_z + dx * 4; + dst_x[0] = 0.0f; + dst_x[1] = 0.0f; + dst_x[2] = 0.0f; + dst_x[3] = 0.0f; + const float16_t *src_dx = src + 4 * dx; + for (sz = 0; sz < ic4; ++sz) { + const float16_t *src_z = src_dx + sz * src_depth_step; + const float16_t *weight_z = weight_dz + sz * 16; + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + dst_x[j] += src_z[i] * weight_z[4 * i + j]; + } + } + } + } + } } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c index 8a70d04f72..0cd5b56e95 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.c @@ -371,8 +371,16 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa #ifdef ENABLE_ARM82_A32 void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) { - // TODO(fun): function - return; + for (int ci = 0; ci < col; ci++) { + float value = 0; + for (int di = 0; di < depth; di++) { + value += a[di] * b[ci * depth + di]; + } + if (bias != NULL) value += bias[ci]; + if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); + if (act_type == ActType_Relu || act_type == ActType_Relu6) value = MSMAX(0.0f, value); + c[ci] = value; + } } #endif @@ -381,7 +389,7 @@ void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f #ifdef ENABLE_ARM64 MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col); #else - MatVecMulA32Fp16(a, b, c, bias, (int)act_type, depth, col); + MatVecMulA32NeonFp16(a, b, c, bias, (int)act_type, depth, col); #endif } @@ -609,6 +617,23 @@ void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, return; } +void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { + if (is_fp32_src) { + const float *fp32_src = (const float *)src; + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r_div12 = r / 12; + int r_mod12 = r % 12; + dst[r_div12 * 12 * col + c * 12 + r_mod12] = (float16_t)(fp32_src[r * col + c]); + } + } + } else { + const float16_t *fp16_src = (const float16_t *)src; + RowMajor2Col12MajorFp16Opt(fp16_src, dst, row, col); + } + return; +} + void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { for (int r = 0; r < row; r++) { for (int c = 0; c < col; c++) { @@ -623,6 +648,20 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, } } +void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int c_div12 = c / 12; + int c_mod12 = c % 12; + if (is_fp32_src) { + dst[c_div12 * 12 * row + r * 12 + c_mod12] = (float16_t)(((const float *)src)[r * col + c]); + } else { + dst[c_div12 * 12 * row + r * 12 + c_mod12] = ((const float16_t *)src)[r * col + c]; + } + } + } +} + void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) { for (int r = 0; r < row; r++) { for (int c = 0; c < col; c++) { diff --git 
a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h index 12a564a677..6cb203345b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/matmul_fp16.h @@ -19,9 +19,6 @@ #include #include -#ifdef ENABLE_ARM64 -#include -#endif #include "nnacl/errorcode.h" #include "nnacl/matmul_parameter.h" #include "nnacl/op_base.h" @@ -63,6 +60,9 @@ void MatMul12x8A32Fp16(const float16_t *a, const float16_t *b, float16_t *dst, c void MatVecMulA32Fp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col); + +void MatVecMulA32NeonFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, + int depth, int col); #endif void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, @@ -79,8 +79,12 @@ void RowMajor2Col12MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); +void RowMajor2Col12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); + void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); +void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); + void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc index 9d58d28aaf..c50a081490 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc @@ -114,7 +114,12 @@ void MatmulBaseFP16CPUKernel::ResizeParameter() { params_->row_align_ = 1; params_->col_align_ = params_->col_; } else { - params_->row_align_ = UP_ROUND(params_->row_, C16NUM); +#ifdef ENABLE_ARM64 + int row_tile = C16NUM; +#else + int row_tile = C12NUM; +#endif + params_->row_align_ = UP_ROUND(params_->row_, row_tile); params_->col_align_ = UP_ROUND(params_->col_, C8NUM); } return; @@ -163,9 +168,17 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) { int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type); float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; if (params_->a_transpose_) { +#ifdef ENABLE_ARM64 RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); +#else + RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); +#endif } else { +#ifdef ENABLE_ARM64 RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); +#else + RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); +#endif } } return;
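For reference, the following is a minimal host-side sketch (not part of the patch) of how the new AArch32 NEON kernel MatVecMulA32NeonFp16 could be cross-checked against the scalar fallback MatVecMulA32Fp16 that this change adds to matmul_fp16.c. It assumes an ENABLE_ARM82_A32 build where float16_t is available and both symbols are linked; the matrix sizes, fill values, and tolerance below are illustrative assumptions only.

/* Sketch only: compares the NEON MatVecMul kernel with the scalar fallback
 * on one small case. Both functions and ActType_Relu come from this patch /
 * the existing nnacl headers; everything else (sizes, data, tolerance) is
 * made up for illustration. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "nnacl/fp16/matmul_fp16.h"

int main(void) {
  const int depth = 17, col = 9;  /* deliberately not multiples of 8 / 4 */
  float16_t *a = malloc(depth * sizeof(float16_t));        /* input vector, length depth */
  float16_t *b = malloc(depth * col * sizeof(float16_t));  /* weights: col blocks of depth values */
  float16_t *c_ref = calloc(col, sizeof(float16_t));
  float16_t *c_asm = calloc(col, sizeof(float16_t));
  for (int i = 0; i < depth; ++i) a[i] = (float16_t)(0.01f * i);
  for (int i = 0; i < depth * col; ++i) b[i] = (float16_t)(0.02f * (i % 13) - 0.1f);

  MatVecMulA32Fp16(a, b, c_ref, NULL, ActType_Relu, depth, col);      /* scalar reference */
  MatVecMulA32NeonFp16(a, b, c_asm, NULL, ActType_Relu, depth, col);  /* new NEON assembly */

  for (int i = 0; i < col; ++i) {
    if (fabsf((float)c_ref[i] - (float)c_asm[i]) > 1e-2f) {  /* loose fp16 tolerance */
      printf("mismatch at %d: %f vs %f\n", i, (float)c_ref[i], (float)c_asm[i]);
    }
  }
  free(a);
  free(b);
  free(c_ref);
  free(c_asm);
  return 0;
}

Both functions expect b laid out column-by-column with depth contiguous float16_t values per output channel, which matches the stride (depth * sizeof(float16_t)) used by the assembly kernel.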