From 48e2f85593a6a669b141e305bd8dfe4bfb248b9c Mon Sep 17 00:00:00 2001 From: lixian <179220644@qq.com> Date: Thu, 6 Aug 2020 21:04:15 +0800 Subject: [PATCH] optimization for depth wise convolution --- .../opclib/assembly/arm32/ConvDwFp32Center.S | 161 +++++ .../opclib/assembly/arm32/ConvDwInt8Center.S | 207 +++++++ .../assembly/arm32/DeconvDwFp32Center.S | 69 +++ .../assembly/arm32/DeconvDwInt8Center.S | 69 +++ .../opclib/assembly/arm64/ConvDwFp32Center.S | 232 +++++++- .../opclib/assembly/arm64/ConvDwInt8Center.S | 558 ++++++++++++++++++ .../assembly/arm64/DeconvDwFp32Center.S | 5 +- .../assembly/arm64/DeconvDwInt8Center.S | 65 ++ .../opclib/assembly/opt/ConvDwFp16Center.S | 294 +++++++++ .../opclib/assembly/opt/DeconvDwFp16Center.S | 64 ++ .../kernel/arm/opclib/fp16/common_func.h | 44 ++ .../arm/opclib/fp16/conv_depthwise_fp16.cc | 38 +- .../kernel/arm/opclib/fp32/common_func.h | 15 +- .../kernel/arm/opclib/int8/common_func.h | 62 ++ .../arm/opclib/int8/conv_depthwise_int8.cc | 27 +- .../kernel/arm/opclib/int8/conv_int8.cc | 20 +- 16 files changed, 1886 insertions(+), 44 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwFp32Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwInt8Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwFp32Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwInt8Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwInt8Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwInt8Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/ConvDwFp16Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/DeconvDwFp16Center.S create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/fp16/common_func.h create mode 100644 mindspore/lite/src/runtime/kernel/arm/opclib/int8/common_func.h diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwFp32Center.S new file mode 100644 index 0000000000..7f1724e656 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwFp32Center.S @@ -0,0 +1,161 @@ +#ifdef __arm__ +#ifndef __aarch64__ + +.text +.align 5 +.global ConvDwFp32Center +#ifndef __APPLE__ +.type ConvDwFp32Center, %function +#endif + +// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: weight, #56: kernel_h, #60: kernel_w, +// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step +// #88: relu, #92: relu6 +ConvDwFp32Center: + // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" + // according to https://stackoverflow.com/questions/53625807 + // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway + // clang's rule seems more simple, though there are no subroutine calls here + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf + push {r0-r8, r10, r11, lr} + vpush {v4-v7} + add sp, sp, #112 + + ldr r4, [sp, #48] + + vld1.32 {q13}, [r3] + vmov.i32 q14, #6 + vcvt.f32.s32 q14, q14 + veor q15, q15, q15 + + LoopH: + ldr r1, [sp, #4] // src_w + ldr r5, [sp, #52] // width + ldr r0, [sp] // dst_w + cmp r5, #4 + blt LoopW + LoopW4: + mov r11, [sp, #76] // in_sw_step + mov r8, r1 // src_kh + ldr r2, [sp, #8] // weight_kh + ldr r6, [sp, #56] // kernel_h + vmov q0, q13 + LoopKh4: + ldr r12, [sp, #80] //in_kh_step + ldr r7, [sp, #60] // kernel_w + mov lr, r8 // src_kw + LoopKw4: + mov r10, lr + vld1.32 {q12}, [r2]! + vld1.32 {q4}, [r10] + add r10, r10, r11 + vmla.f32 q0, q4, q12 + vld1.32 {q5}, [r10] + add r10, r10, r11 + vmla.f32 q1, q5, q12 + vld1.32 {q6}, [r10] + add r10, r10, r11 + vmla.f32 q2, q6, q12 + vld1.32 {q7}, [r10] + add r10, r10, r11 + vmla.f32 q3, q7, q12 + subs r7, r7, #1 + add lr, lr, r12 + bne LoopKw4 + ldr r12, [sp, #80] + add r8, r8, r12 + subs r6, r6, #1 + bne LoopKh4 + ldr r12, [sp, #92] + cmp r12, #0 + bne Relu64 + ldr r12, [sp, #88] + cmp r12, #0 + bne Relu4 + b Write4 + Relu64: + vmin.f32 q0, q0, q14 + vmin.f32 q1, q1, q14 + vmin.f32 q2, q2, q14 + vmin.f32 q3, q3, q14 + Relu4: + vmax.f32 q0, q0, q15 + vmax.f32 q1, q1, q15 + vmax.f32 q2, q2, q15 + vmax.f32 q3, q3, q15 + Write4: + ldr r12, [sp, #68] + vst1.32 {q0}, [r0] + add r0, r0, r12 + vst1.32 {q1}, [r0] + add r0, r0, r12 + vst1.32 {q2}, [r0] + add r0, r0, r12 + vst1.32 {q3}, [r0] + add r0, r0, r12 + mov r12, #4 + mul r11, r11, r12 + add r1, r1, r11 + sub r5, r5, #4 + cmp r5, r5, #0 + ble LoopWEnd + cmp r5, #4 + bge LoopW + LoopW: + mov r8, r1 // src_kh + ldr r2, [sp, #8] // weight_kh + ldr r6, [sp, #56] // kernel_h + vmov q0, q13 + LoopKh: + ldr r12, [sp, #84] //in_kw_step + ldr r7, [sp, #60] // kernel_w + mov r10, r8 // src_kw + LoopKw: + vld1.32 {q1}, [r10] + add r10, r10, r12 + vld1.32 {q12}, [r2]! + vmla.f32 q0, q1, q12 + subs r7, r7, #1 + bne LoopKw + ldr r12, [sp, #80] + add r8, r8, r12 + subs r6, r6, #1 + bne LoopKh + ldr r12, [sp, #92] + cmp r12, #0 + bne Relu6 + ldr r12, [sp, #88] + cmp r12, #0 + bne Relu + b Write + Relu6: + vmin.f32 q0, q0, q14 + Relu: + vmax.f32 q0, q0, q15 + Write: + ldr r12, [sp, #68] + vst1.32 {q0}, [r0] + add r0, r0, r12 + ldr r12, [sp, #76] + add r1, r1, r12 + subs r5, r5, #1 + bne LoopW + ldr r3, [sp, #64] + ldr r12, [sp] + add r12, r12, r3 + str r12, [sp] + ldr r3, [sp, #72] + ldr r12, [sp, #4] + add r12, r12, r3 + str r12, [sp, #4] + subs r4, r4, #1 + bne LoopH +LoopWEnd: + sub sp, sp, #112 + vpop {v4-v7} + pop {r0-r8, r10, r11, pc} +#endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwInt8Center.S new file mode 100644 index 0000000000..b97dc8a9ee --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/ConvDwInt8Center.S @@ -0,0 +1,207 @@ +#ifdef __arm__ +#ifndef __aarch64__ + +.text +.align 5 +.global ConvDwInt8Center +#ifndef __APPLE__ +.type ConvDwInt8Center, %function +#endif + +// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, +// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); +// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w, +// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step +// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max +ConvDwInt8Center: + // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" + // according to https://stackoverflow.com/questions/53625807 + // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway + // clang's rule seems more simple, though there are no subroutine calls here + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf + push {r0-r8, r10, r11, lr} + vpush {q4-q7} + add sp, sp, #112 + + ldr r4, [sp, #48] + + ldr r12, [sp, #92] + vdup.32 q9, r12 + + ldr r11, [sp, #88] + vdup.32 q10, r11 + + ldr r10, [sp, #96] + vdup.32 q11, r10 + + ldr r8, [sp, #100] + vdup.32 q12, r8 + + ldr r7, [sp, #104] + vdup.32 q13, r7 + + ldr r6, [sp, #108] + vdup.32 q14, r6 + + vld1.32 {q15}, [r3] + + LoopH: + ldr r1, [sp, #4] // src_w + ldr r5, [sp, #52] // width + ldr r0, [sp] // dst_w + LoopW4: + mov r11, [sp, #76] // in_sw_step + mov r8, r1 // src_kh + ldr r2, [sp, #8] // weight_kh + ldr r6, [sp, #56] // kernel_h + vmov q0, q15 + LoopKh4: + ldr r12, [sp, #80] //in_kh_step + ldr r7, [sp, #60] // kernel_w + mov r10, r8 // src_kw + LoopKw4: + vld1.16 {d24}, [r2]! + vld1.16 {d8}, [r10] + add r10, r10, r11 + vmlal.s16 q0, d8, d24 + vld1.16 {d10}, [r10] + add r10, r10, r11 + vmlal.s16 q1, d10, d24 + vld1.16 {d12}, [r10] + add r10, r10, r11 + vmlal.s16 q2, d12, d24 + vld1.16 {d14}, [r10] + add r10, r10, r11 + vmlal.s16 q3, d14, d24 + subs r7, r7, #1 + bne LoopKw4 + ldr r12, [sp, #80] + add r8, r8, r12 + subs r6, r6, #1 + bne LoopKh4 + + vshl.s32 q0, q0, q9 + vshl.s32 q1, q1, q9 + vshl.s32 q2, q2, q9 + vshl.s32 q3, q3, q9 + vqrdmulh.s32 q0, q0, q10 + vqrdmulh.s32 q1, q1, q10 + vqrdmulh.s32 q2, q2, q10 + vqrdmulh.s32 q3, q3, q10 + vrshl.s32 q0, q0, q11 + vrshl.s32 q1, q1, q11 + vrshl.s32 q2, q2, q11 + vrshl.s32 q3, q3, q11 + vadd.i32 q0, q0, q12 + vadd.i32 q1, q1, q12 + vadd.i32 q2, q2, q12 + vadd.i32 q3, q3, q12 + vmax.s32 q0, q0, q13 + vmax.s32 q1, q1, q13 + vmax.s32 q2, q2, q13 + vmax.s32 q3, q3, q13 + vmin.s32 q0, q0, q14 + vmin.s32 q1, q1, q14 + vmin.s32 q2, q2, q14 + vmin.s32 q3, q3, q14 + + vqmovn.s32 d0, q0 + vqmovn.s32 d2, q1 + vqmovn.s32 d4, q2 + vqmovn.s32 d6, q3 + vqmovn.s16 d0, q0 + vqmovn.s16 d2, q1 + vqmovn.s16 d4, q2 + vqmovn.s16 d6, q3 + + mov r3, r0 + ldr r12, [sp, #68] + vst1.8 {d0[0]}, [r3]! + vst1.8 {d0[1]}, [r3]! + vst1.8 {d0[2]}, [r3]! + vst1.8 {d0[3]}, [r3]! + add r0, r0, r12 + mov r3, r0 + vst1.8 {d2[0]}, [r3]! + vst1.8 {d2[1]}, [r3]! + vst1.8 {d2[2]}, [r3]! + vst1.8 {d2[3]}, [r3]! + add r0, r0, r12 + mov r3, r0 + vst1.8 {d4[0]}, [r3]! + vst1.8 {d4[1]}, [r3]! + vst1.8 {d4[2]}, [r3]! + vst1.8 {d4[3]}, [r3]! + add r0, r0, r12 + mov r3, r0 + vst1.8 {d6[0]}, [r3]! + vst1.8 {d6[1]}, [r3]! + vst1.8 {d6[2]}, [r3]! + vst1.8 {d6[3]}, [r3]! + add r0, r0, r12 + mov r3, r0 + mov r12, #4 + mul r11, r11, r12 + add r1, r1, r11 + subs r5, r5, #1 + bne LoopW4 + LoopW: + mov r8, r1 // src_kh + ldr r2, [sp, #8] // weight_kh + ldr r6, [sp, #56] // kernel_h + vmov q0, q15 + LoopKh: + ldr r12, [sp, #84] //in_kw_step + ldr r7, [sp, #60] // kernel_w + mov r10, r8 // src_kw + LoopKw: + vld1.16 {d2}, [r10] + add r10, r10, r12 + vld1.16 {d24}, [r2]! + vmlal.s16 q0, d2, d24 + subs r7, r7, #1 + bne LoopKw + ldr r12, [sp, #80] + add r8, r8, r12 + subs r6, r6, #1 + bne LoopKh + + vshl.s32 q0, q0, q9 + vqrdmulh.s32 q0, q0, q10 + vrshl.s32 q0, q0, q11 + vadd.i32 q0, q0, q12 + vmax.s32 q0, q0, q13 + vmin.s32 q0, q0, q14 + + vqmovn.s32 d0, q0 + vqmovn.s16 d0, q0 + + mov r3, r0 + ldr r12, [sp, #68] + vst1.8 {d0[0]}, [r3]! + vst1.8 {d0[1]}, [r3]! + vst1.8 {d0[2]}, [r3]! + vst1.8 {d0[3]}, [r3]! + add r0, r0, r12 + ldr r12, [sp, #76] + add r1, r1, r12 + subs r5, r5, #1 + bne LoopW + ldr r3, [sp, #64] + ldr r12, [sp] + add r12, r12, r3 + str r12, [sp] + ldr r3, [sp, #72] + ldr r12, [sp, #4] + add r12, r12, r3 + str r12, [sp, #4] + subs r4, r4, #1 + bne LoopH + + sub sp, sp, #112 + vpop {q4-q7} + pop {r0-r8, r10, r11, pc} +#endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwFp32Center.S new file mode 100644 index 0000000000..06c38740a5 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwFp32Center.S @@ -0,0 +1,69 @@ +#ifdef __arm__ +#ifndef __aarch64__ + +.text +.align 5 +.global DeconvDwFp32Center +#ifndef __APPLE__ +.type DeconvDwFp32Center, %function +#endif + +// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, +// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step +// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step +DeconvDwFp32Center: + // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" + // according to https://stackoverflow.com/questions/53625807 + // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway + // clang's rule seems more simple, though there are no subroutine calls here + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf + push {r0-r8, r10, r11, lr} + + ldr r10, [sp, #80] // in_kw_step + ldr r11, [sp, #76] // in_kh_step + + LoopH: + ldr r0, [sp] // dst_w + ldr r1, [sp, #4] // src_w + ldr r4, [sp, #48] // width + LoopW: + mov r6, r0 // dst_kh + ldr r2, [sp, #8] // weight_kh + ldr r5, [sp, #52] // kernel_h + vld1.32 {q1}, [r1] + LoopKh: + mov r7, r6 // dst_kw + ldr r12, [sp, #56] // kernel_w + LoopKw: + vld1.32 {q0}, [r7] + vld1.32 {q2}, [r2]! + vmla.f32 q0, q1, q2 + vst1.32 {q0}, [r7] + add r7, r7, r10 + subs r12, r12, #1 + bne LoopKw + add r6, r6, r11 + subs r5, r5, #1 + bne LoopKh + ldr r12, [sp, #72] + add r0, r0, r12 + ldr r8, [sp, #64] + add r1, r1, r8 + subs r4, r4, #1 + bne LoopW + ldr r8, [sp, #68] + ldr r12, [sp] + add r12, r12, r8 + str r12, [sp] + ldr r8, [sp, #60] + ldr r12, [sp, #4] + add r12, r12, r8 + str r12, [sp, #4] + subs r3, r3, #1 + bne LoopH + + pop {r0-r8, r10, r11, pc} +#endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwInt8Center.S new file mode 100644 index 0000000000..abae39e13a --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/DeconvDwInt8Center.S @@ -0,0 +1,69 @@ +#ifdef __arm__ +#ifndef __aarch64__ + +.text +.align 5 +.global DeconvDwInt8Center +#ifndef __APPLE__ +.type DeconvDwInt8Center, %function +#endif + +// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, +// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +// r0: dst, r1: src, r2: weight, r3: height, r4: width, #52: kernel_h, #56: kernel_w, #60: out_h_step +// #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step +DeconvDwInt8Center: + // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" + // according to https://stackoverflow.com/questions/53625807 + // even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway + // clang's rule seems more simple, though there are no subroutine calls here + // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf + push {r0-r8, r10, r11, lr} + + ldr r10, [sp, #80] // in_kw_step + ldr r11, [sp, #76] // in_kh_step + + LoopH: + ldr r0, [sp] // dst_w + ldr r1, [sp, #4] // src_w + ldr r4, [sp, #48] // width + LoopW: + mov r6, r0 // dst_kh + ldr r2, [sp, #8] // weight_kh + ldr r5, [sp, #52] // kernel_h + vld1.16 {d2}, [r1] + LoopKh: + mov r7, r6 // dst_kw + ldr r12, [sp, #56] // kernel_w + LoopKw: + vld1.32 {q0}, [r7] + vld1.16 {d24}, [r2]! + vmlal.s16 q0, d2, d24 + vst1.32 {q0}, [r7] + add r7, r7, r10 + subs r12, r12, #1 + bne LoopKw + add r6, r6, r11 + subs r5, r5, #1 + bne LoopKh + ldr r12, [sp, #72] + add r0, r0, r12 + ldr r8, [sp, #64] + add r1, r1, r8 + subs r4, r4, #1 + bne LoopW + ldr r8, [sp, #68] + ldr r12, [sp] + add r12, r12, r8 + str r12, [sp] + ldr r8, [sp, #60] + ldr r12, [sp, #4] + add r12, r12, r8 + str r12, [sp, #4] + subs r3, r3, #1 + bne LoopH + + pop {r0-r8, r10, r11, pc} +#endif +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S index 1e27860d72..6b51afbe05 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S @@ -32,24 +32,238 @@ ConvDwFp32Center: ldr x14, [sp, #48] ldr x15, [sp, #56] - ld1 {v5.4s}, [x3] + ld1 {v24.4s}, [x3] + movi v26.4s, #6 + scvtf v26.4s, v26.4s + dup v27.4s, wzr LoopH: mov x23, x1 mov x24, x5 mov x3, x0 + cmp x24, #8 + blt LoopW + cmp x24, #16 + blt LoopW8 + + LoopW16: + mov x19, #16 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + mov v8.16b, v24.16b + mov v9.16b, v24.16b + mov v10.16b, v24.16b + mov v11.16b, v24.16b + mov v12.16b, v24.16b + mov v13.16b, v24.16b + mov v14.16b, v24.16b + mov v15.16b, v24.16b + LoopKh16: + mov x18, x7 + mov x21, x16 + LoopKw16: + mov x22, x21 + ld1 {v25.4s}, [x17], #16 + ld1 {v16.4s}, [x22], x11 + ld1 {v17.4s}, [x22], x11 + fmla v0.4s, v16.4s, v25.4s + fmla v1.4s, v17.4s, v25.4s + ld1 {v18.4s}, [x22], x11 + ld1 {v19.4s}, [x22], x11 + fmla v2.4s, v18.4s, v25.4s + fmla v3.4s, v19.4s, v25.4s + ld1 {v20.4s}, [x22], x11 + ld1 {v21.4s}, [x22], x11 + fmla v4.4s, v20.4s, v25.4s + fmla v5.4s, v21.4s, v25.4s + ld1 {v22.4s}, [x22], x11 + ld1 {v23.4s}, [x22], x11 + fmla v6.4s, v22.4s, v25.4s + fmla v7.4s, v23.4s, v25.4s + ld1 {v16.4s}, [x22], x11 + ld1 {v17.4s}, [x22], x11 + fmla v8.4s, v16.4s, v25.4s + fmla v9.4s, v17.4s, v25.4s + ld1 {v18.4s}, [x22], x11 + ld1 {v19.4s}, [x22], x11 + fmla v10.4s, v18.4s, v25.4s + fmla v11.4s, v19.4s, v25.4s + ld1 {v20.4s}, [x22], x11 + ld1 {v21.4s}, [x22], x11 + fmla v12.4s, v20.4s, v25.4s + fmla v13.4s, v21.4s, v25.4s + ld1 {v22.4s}, [x22], x11 + ld1 {v23.4s}, [x22], x11 + fmla v14.4s, v22.4s, v25.4s + fmla v15.4s, v23.4s, v25.4s + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw16 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh16 + cbnz x15, Relu616 + cbnz x14, Relu16 + b Write16 + Relu616: + fmin v0.4s, v0.4s, v26.4s + fmin v1.4s, v1.4s, v26.4s + fmin v2.4s, v2.4s, v26.4s + fmin v3.4s, v3.4s, v26.4s + fmin v4.4s, v4.4s, v26.4s + fmin v5.4s, v5.4s, v26.4s + fmin v6.4s, v6.4s, v26.4s + fmin v7.4s, v7.4s, v26.4s + fmin v8.4s, v8.4s, v26.4s + fmin v9.4s, v9.4s, v26.4s + fmin v10.4s, v10.4s, v26.4s + fmin v11.4s, v11.4s, v26.4s + fmin v12.4s, v12.4s, v26.4s + fmin v13.4s, v13.4s, v26.4s + fmin v14.4s, v14.4s, v26.4s + fmin v15.4s, v15.4s, v26.4s + Relu16: + fmax v0.4s, v0.4s, v27.4s + fmax v1.4s, v1.4s, v27.4s + fmax v2.4s, v2.4s, v27.4s + fmax v3.4s, v3.4s, v27.4s + fmax v4.4s, v4.4s, v27.4s + fmax v5.4s, v5.4s, v27.4s + fmax v6.4s, v6.4s, v27.4s + fmax v7.4s, v7.4s, v27.4s + fmax v8.4s, v8.4s, v27.4s + fmax v9.4s, v9.4s, v27.4s + fmax v10.4s, v10.4s, v27.4s + fmax v11.4s, v11.4s, v27.4s + fmax v12.4s, v12.4s, v27.4s + fmax v13.4s, v13.4s, v27.4s + fmax v14.4s, v14.4s, v27.4s + fmax v15.4s, v15.4s, v27.4s + Write16: + st1 {v0.4s}, [x3], x9 + st1 {v1.4s}, [x3], x9 + st1 {v2.4s}, [x3], x9 + st1 {v3.4s}, [x3], x9 + st1 {v4.4s}, [x3], x9 + st1 {v5.4s}, [x3], x9 + st1 {v6.4s}, [x3], x9 + st1 {v7.4s}, [x3], x9 + st1 {v8.4s}, [x3], x9 + st1 {v9.4s}, [x3], x9 + st1 {v10.4s}, [x3], x9 + st1 {v11.4s}, [x3], x9 + st1 {v12.4s}, [x3], x9 + st1 {v13.4s}, [x3], x9 + st1 {v14.4s}, [x3], x9 + st1 {v15.4s}, [x3], x9 + add x23, x23, x19 + sub x24, x24, #16 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + blt LoopW + cmp x24, #16 + bge LoopW16 + LoopW8: + mov x19, #8 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + LoopKh8: + mov x18, x7 + mov x21, x16 + LoopKw8: + mov x22, x21 + ld1 {v25.4s}, [x17], #16 + ld1 {v16.4s}, [x22], x11 + ld1 {v17.4s}, [x22], x11 + fmla v0.4s, v16.4s, v25.4s + fmla v1.4s, v17.4s, v25.4s + ld1 {v18.4s}, [x22], x11 + ld1 {v19.4s}, [x22], x11 + fmla v2.4s, v18.4s, v25.4s + fmla v3.4s, v19.4s, v25.4s + ld1 {v20.4s}, [x22], x11 + ld1 {v21.4s}, [x22], x11 + fmla v4.4s, v20.4s, v25.4s + fmla v5.4s, v21.4s, v25.4s + ld1 {v22.4s}, [x22], x11 + ld1 {v23.4s}, [x22], x11 + fmla v6.4s, v22.4s, v25.4s + fmla v7.4s, v23.4s, v25.4s + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw8 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh8 + cbnz x15, Relu68 + cbnz x14, Relu8 + b Write8 + Relu68: + fmin v0.4s, v0.4s, v26.4s + fmin v1.4s, v1.4s, v26.4s + fmin v2.4s, v2.4s, v26.4s + fmin v3.4s, v3.4s, v26.4s + fmin v4.4s, v4.4s, v26.4s + fmin v5.4s, v5.4s, v26.4s + fmin v6.4s, v6.4s, v26.4s + fmin v7.4s, v7.4s, v26.4s + Relu8: + fmax v0.4s, v0.4s, v27.4s + fmax v1.4s, v1.4s, v27.4s + fmax v2.4s, v2.4s, v27.4s + fmax v3.4s, v3.4s, v27.4s + fmax v4.4s, v4.4s, v27.4s + fmax v5.4s, v5.4s, v27.4s + fmax v6.4s, v6.4s, v27.4s + fmax v7.4s, v7.4s, v27.4s + Write8: + st1 {v0.4s}, [x3], x9 + st1 {v1.4s}, [x3], x9 + st1 {v2.4s}, [x3], x9 + st1 {v3.4s}, [x3], x9 + st1 {v4.4s}, [x3], x9 + st1 {v5.4s}, [x3], x9 + st1 {v6.4s}, [x3], x9 + st1 {v7.4s}, [x3], x9 + add x23, x23, x19 + sub x24, x24, #8 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + bge LoopW8 LoopW: mov x16, x23 mov x17, x2 mov x20, x6 - mov v0.16b, v5.16b + mov v0.16b, v24.16b LoopKh: mov x18, x7 mov x22, x16 LoopKw: - ld1 {v1.4s}, [x22], x13 - ld1 {v2.4s}, [x17], #16 - fmla v0.4s, v1.4s, v2.4s + ld1 {v16.4s}, [x22], x13 + ld1 {v25.4s}, [x17], #16 + fmla v0.4s, v16.4s, v25.4s subs x18, x18, #1 bne LoopKw add x16, x16, x12 @@ -59,17 +273,15 @@ ConvDwFp32Center: cbnz x14, Relu b Write Relu6: - movi v4.4s, #6 - scvtf v4.4s, v4.4s - fmin v0.4s, v0.4s, v4.4s + fmin v0.4s, v0.4s, v26.4s Relu: - dup v3.4s, wzr - fmax v0.4s, v0.4s, v3.4s + fmax v0.4s, v0.4s, v27.4s Write: st1 {v0.4s}, [x3], x9 add x23, x23, x11 subs x24, x24, #1 bne LoopW + LoopWEnd: add x0, x0, x8 add x1, x1, x10 subs x4, x4, #1 diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwInt8Center.S new file mode 100644 index 0000000000..0381b6bdb0 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwInt8Center.S @@ -0,0 +1,558 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwInt8Center +#ifndef __APPLE__ +.type ConvDwInt8Center, %function +#endif + +// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, +// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); +// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, +// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step +// x14: out_multiplier, #56: left_shift, #64: right_shift, #72:out_zp, #80: acc_min, #88: acc_max +ConvDwInt8Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #48 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + ldr x13, [sp, #40] + + ldr w14, [sp, #56] + dup v26.4s, w14 + + ldr x15, [sp, #48] + dup v27.4s, w15 + + ldr w16, [sp, #64] + dup v28.4s, w16 + + ldr w17, [sp, #72] + dup v29.4s, w17 + + ldr w18, [sp, #80] + dup v30.4s, w18 + + ldr w19, [sp, #88] + dup v31.4s, w19 + + ld1 {v24.4s}, [x3] + + LoopH: + mov x23, x1 + mov x24, x5 + mov x3, x0 + cmp x24, #8 + blt LoopW + cmp x24, #16 + blt LoopW8 + + LoopW16: + mov x19, #16 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + mov v8.16b, v24.16b + mov v9.16b, v24.16b + mov v10.16b, v24.16b + mov v11.16b, v24.16b + mov v12.16b, v24.16b + mov v13.16b, v24.16b + mov v14.16b, v24.16b + mov v15.16b, v24.16b + LoopKh16: + mov x18, x7 + mov x21, x16 + LoopKw16: + mov x22, x21 + ld1 {v25.4h}, [x17], #8 + ld1 {v16.4h}, [x22], x13 + ld1 {v17.4h}, [x22], x13 + smlal v0.4s, v16.4h, v25.4h + smlal v1.4s, v17.4h, v25.4h + ld1 {v18.4h}, [x22], x13 + ld1 {v19.4h}, [x22], x13 + smlal v2.4s, v18.4h, v25.4h + smlal v3.4s, v19.4h, v25.4h + ld1 {v20.4h}, [x22], x13 + ld1 {v21.4h}, [x22], x13 + smlal v4.4s, v20.4h, v25.4h + smlal v5.4s, v21.4h, v25.4h + ld1 {v22.4h}, [x22], x13 + ld1 {v23.4h}, [x22], x13 + smlal v6.4s, v22.4h, v25.4h + smlal v7.4s, v23.4h, v25.4h + ld1 {v16.4h}, [x22], x13 + ld1 {v17.4h}, [x22], x13 + smlal v8.4s, v16.4h, v25.4h + smlal v9.4s, v17.4h, v25.4h + ld1 {v18.4h}, [x22], x13 + ld1 {v19.4h}, [x22], x13 + smlal v10.4s, v18.4h, v25.4h + smlal v11.4s, v19.4h, v25.4h + ld1 {v20.4h}, [x22], x13 + ld1 {v21.4h}, [x22], x13 + smlal v12.4s, v20.4h, v25.4h + smlal v13.4s, v21.4h, v25.4h + ld1 {v22.4h}, [x22], x13 + ld1 {v23.4h}, [x22], x13 + smlal v14.4s, v22.4h, v25.4h + smlal v15.4s, v23.4h, v25.4h + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw16 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh16 + + sqshl v0.4s, v0.4s ,v26.4s + sqshl v1.4s, v1.4s ,v26.4s + sqshl v2.4s, v2.4s ,v26.4s + sqshl v3.4s, v3.4s ,v26.4s + sqshl v4.4s, v4.4s ,v26.4s + sqshl v5.4s, v5.4s ,v26.4s + sqshl v6.4s, v6.4s ,v26.4s + sqshl v7.4s, v7.4s ,v26.4s + sqshl v8.4s, v8.4s ,v26.4s + sqshl v9.4s, v9.4s ,v26.4s + sqshl v10.4s, v10.4s ,v26.4s + sqshl v11.4s, v11.4s ,v26.4s + sqshl v12.4s, v12.4s ,v26.4s + sqshl v13.4s, v13.4s ,v26.4s + sqshl v14.4s, v14.4s ,v26.4s + sqshl v15.4s, v15.4s ,v26.4s + sqrdmulh v0.4s, v0.4s ,v27.4s + sqrdmulh v1.4s, v1.4s ,v27.4s + sqrdmulh v2.4s, v2.4s ,v27.4s + sqrdmulh v3.4s, v3.4s ,v27.4s + sqrdmulh v4.4s, v4.4s ,v27.4s + sqrdmulh v5.4s, v5.4s ,v27.4s + sqrdmulh v6.4s, v6.4s ,v27.4s + sqrdmulh v7.4s, v7.4s ,v27.4s + sqrdmulh v8.4s, v8.4s ,v27.4s + sqrdmulh v9.4s, v9.4s ,v27.4s + sqrdmulh v10.4s, v10.4s ,v27.4s + sqrdmulh v11.4s, v11.4s ,v27.4s + sqrdmulh v12.4s, v12.4s ,v27.4s + sqrdmulh v13.4s, v13.4s ,v27.4s + sqrdmulh v14.4s, v14.4s ,v27.4s + sqrdmulh v15.4s, v15.4s ,v27.4s + sqrshl v0.4s, v0.4s ,v28.4s + sqrshl v1.4s, v1.4s ,v28.4s + sqrshl v2.4s, v2.4s ,v28.4s + sqrshl v3.4s, v3.4s ,v28.4s + sqrshl v4.4s, v4.4s ,v28.4s + sqrshl v5.4s, v5.4s ,v28.4s + sqrshl v6.4s, v6.4s ,v28.4s + sqrshl v7.4s, v7.4s ,v28.4s + sqrshl v8.4s, v8.4s ,v28.4s + sqrshl v9.4s, v9.4s ,v28.4s + sqrshl v10.4s, v10.4s ,v28.4s + sqrshl v11.4s, v11.4s ,v28.4s + sqrshl v12.4s, v12.4s ,v28.4s + sqrshl v13.4s, v13.4s ,v28.4s + sqrshl v14.4s, v14.4s ,v28.4s + sqrshl v15.4s, v15.4s ,v28.4s + add v0.4s, v0.4s ,v29.4s + add v1.4s, v1.4s ,v29.4s + add v2.4s, v2.4s ,v29.4s + add v3.4s, v3.4s ,v29.4s + add v4.4s, v4.4s ,v29.4s + add v5.4s, v5.4s ,v29.4s + add v6.4s, v6.4s ,v29.4s + add v7.4s, v7.4s ,v29.4s + add v8.4s, v8.4s ,v29.4s + add v9.4s, v9.4s ,v29.4s + add v10.4s, v10.4s ,v29.4s + add v11.4s, v11.4s ,v29.4s + add v12.4s, v12.4s ,v29.4s + add v13.4s, v13.4s ,v29.4s + add v14.4s, v14.4s ,v29.4s + add v15.4s, v15.4s ,v29.4s + smax v0.4s, v0.4s ,v30.4s + smax v1.4s, v1.4s ,v30.4s + smax v2.4s, v2.4s ,v30.4s + smax v3.4s, v3.4s ,v30.4s + smax v4.4s, v4.4s ,v30.4s + smax v5.4s, v5.4s ,v30.4s + smax v6.4s, v6.4s ,v30.4s + smax v7.4s, v7.4s ,v30.4s + smax v8.4s, v8.4s ,v30.4s + smax v9.4s, v9.4s ,v30.4s + smax v10.4s, v10.4s ,v30.4s + smax v11.4s, v11.4s ,v30.4s + smax v12.4s, v12.4s ,v30.4s + smax v13.4s, v13.4s ,v30.4s + smax v14.4s, v14.4s ,v30.4s + smax v15.4s, v15.4s ,v30.4s + smin v0.4s, v0.4s ,v31.4s + smin v1.4s, v1.4s ,v31.4s + smin v2.4s, v2.4s ,v31.4s + smin v3.4s, v3.4s ,v31.4s + smin v4.4s, v4.4s ,v31.4s + smin v5.4s, v5.4s ,v31.4s + smin v6.4s, v6.4s ,v31.4s + smin v7.4s, v7.4s ,v31.4s + smin v8.4s, v8.4s ,v31.4s + smin v9.4s, v9.4s ,v31.4s + smin v10.4s, v10.4s ,v31.4s + smin v11.4s, v11.4s ,v31.4s + smin v12.4s, v12.4s ,v31.4s + smin v13.4s, v13.4s ,v31.4s + smin v14.4s, v14.4s ,v31.4s + smin v15.4s, v15.4s ,v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + sqxtn v2.4h, v2.4s + sqxtn v3.4h, v3.4s + sqxtn v4.4h, v4.4s + sqxtn v5.4h, v5.4s + sqxtn v6.4h, v6.4s + sqxtn v7.4h, v7.4s + sqxtn v8.4h, v8.4s + sqxtn v9.4h, v9.4s + sqxtn v10.4h, v10.4s + sqxtn v11.4h, v11.4s + sqxtn v12.4h, v12.4s + sqxtn v13.4h, v13.4s + sqxtn v14.4h, v14.4s + sqxtn v15.4h, v15.4s + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + sqxtn v4.8b, v4.8h + sqxtn v5.8b, v5.8h + sqxtn v6.8b, v6.8h + sqxtn v7.8b, v7.8h + sqxtn v8.8b, v8.8h + sqxtn v9.8b, v9.8h + sqxtn v10.8b, v10.8h + sqxtn v11.8b, v11.8h + sqxtn v12.8b, v12.8h + sqxtn v13.8b, v13.8h + sqxtn v14.8b, v14.8h + sqxtn v15.8b, v15.8h + + add x17, x3, #1 + add x18, x3, #2 + add x21, x3, #3 + st1 {v0.b}[0], [x3], x9 + st1 {v0.b}[1], [x17], x9 + st1 {v0.b}[2], [x18], x9 + st1 {v0.b}[3], [x21], x9 + + st1 {v1.b}[0], [x3], x9 + st1 {v1.b}[1], [x17], x9 + st1 {v1.b}[2], [x18], x9 + st1 {v1.b}[3], [x21], x9 + + st1 {v2.b}[0], [x3], x9 + st1 {v2.b}[1], [x17], x9 + st1 {v2.b}[2], [x18], x9 + st1 {v2.b}[3], [x21], x9 + + st1 {v3.b}[0], [x3], x9 + st1 {v3.b}[1], [x17], x9 + st1 {v3.b}[2], [x18], x9 + st1 {v3.b}[3], [x21], x9 + + st1 {v4.b}[0], [x3], x9 + st1 {v4.b}[1], [x17], x9 + st1 {v4.b}[2], [x18], x9 + st1 {v4.b}[3], [x21], x9 + + st1 {v5.b}[0], [x3], x9 + st1 {v5.b}[1], [x17], x9 + st1 {v5.b}[2], [x18], x9 + st1 {v5.b}[3], [x21], x9 + + st1 {v6.b}[0], [x3], x9 + st1 {v6.b}[1], [x17], x9 + st1 {v6.b}[2], [x18], x9 + st1 {v6.b}[3], [x21], x9 + + st1 {v7.b}[0], [x3], x9 + st1 {v7.b}[1], [x17], x9 + st1 {v7.b}[2], [x18], x9 + st1 {v7.b}[3], [x21], x9 + + st1 {v8.b}[0], [x3], x9 + st1 {v8.b}[1], [x17], x9 + st1 {v8.b}[2], [x18], x9 + st1 {v8.b}[3], [x21], x9 + + st1 {v9.b}[0], [x3], x9 + st1 {v9.b}[1], [x17], x9 + st1 {v9.b}[2], [x18], x9 + st1 {v9.b}[3], [x21], x9 + + st1 {v10.b}[0], [x3], x9 + st1 {v10.b}[1], [x17], x9 + st1 {v10.b}[2], [x18], x9 + st1 {v10.b}[3], [x21], x9 + + st1 {v11.b}[0], [x3], x9 + st1 {v11.b}[1], [x17], x9 + st1 {v11.b}[2], [x18], x9 + st1 {v11.b}[3], [x21], x9 + + st1 {v12.b}[0], [x3], x9 + st1 {v12.b}[1], [x17], x9 + st1 {v12.b}[2], [x18], x9 + st1 {v12.b}[3], [x21], x9 + + st1 {v13.b}[0], [x3], x9 + st1 {v13.b}[1], [x17], x9 + st1 {v13.b}[2], [x18], x9 + st1 {v13.b}[3], [x21], x9 + + st1 {v14.b}[0], [x3], x9 + st1 {v14.b}[1], [x17], x9 + st1 {v14.b}[2], [x18], x9 + st1 {v14.b}[3], [x21], x9 + + st1 {v15.b}[0], [x3], x9 + st1 {v15.b}[1], [x17], x9 + st1 {v15.b}[2], [x18], x9 + st1 {v15.b}[3], [x21], x9 + + add x23, x23, x19 + sub x24, x24, #16 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + blt LoopW + cmp x24, #16 + bge LoopW16 + LoopW8: + mov x19, #8 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + LoopKh8: + mov x18, x7 + mov x21, x16 + LoopKw8: + mov x22, x21 + ld1 {v25.4h}, [x17], #8 + ld1 {v16.4h}, [x22], x13 + ld1 {v17.4h}, [x22], x13 + smlal v0.4s, v16.4h, v25.4h + smlal v1.4s, v17.4h, v25.4h + ld1 {v18.4h}, [x22], x13 + ld1 {v19.4h}, [x22], x13 + smlal v2.4s, v18.4h, v25.4h + smlal v3.4s, v19.4h, v25.4h + ld1 {v20.4h}, [x22], x13 + ld1 {v21.4h}, [x22], x13 + smlal v4.4s, v20.4h, v25.4h + smlal v5.4s, v21.4h, v25.4h + ld1 {v22.4h}, [x22], x13 + ld1 {v23.4h}, [x22], x13 + smlal v6.4s, v22.4h, v25.4h + smlal v7.4s, v23.4h, v25.4h + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw8 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh8 + + sqshl v0.4s, v0.4s ,v26.4s + sqshl v1.4s, v1.4s ,v26.4s + sqshl v2.4s, v2.4s ,v26.4s + sqshl v3.4s, v3.4s ,v26.4s + sqshl v4.4s, v4.4s ,v26.4s + sqshl v5.4s, v5.4s ,v26.4s + sqshl v6.4s, v6.4s ,v26.4s + sqshl v7.4s, v7.4s ,v26.4s + sqrdmulh v0.4s, v0.4s ,v27.4s + sqrdmulh v1.4s, v1.4s ,v27.4s + sqrdmulh v2.4s, v2.4s ,v27.4s + sqrdmulh v3.4s, v3.4s ,v27.4s + sqrdmulh v4.4s, v4.4s ,v27.4s + sqrdmulh v5.4s, v5.4s ,v27.4s + sqrdmulh v6.4s, v6.4s ,v27.4s + sqrdmulh v7.4s, v7.4s ,v27.4s + sqrshl v0.4s, v0.4s ,v28.4s + sqrshl v1.4s, v1.4s ,v28.4s + sqrshl v2.4s, v2.4s ,v28.4s + sqrshl v3.4s, v3.4s ,v28.4s + sqrshl v4.4s, v4.4s ,v28.4s + sqrshl v5.4s, v5.4s ,v28.4s + sqrshl v6.4s, v6.4s ,v28.4s + sqrshl v7.4s, v7.4s ,v28.4s + add v0.4s, v0.4s ,v29.4s + add v1.4s, v1.4s ,v29.4s + add v2.4s, v2.4s ,v29.4s + add v3.4s, v3.4s ,v29.4s + add v4.4s, v4.4s ,v29.4s + add v5.4s, v5.4s ,v29.4s + add v6.4s, v6.4s ,v29.4s + add v7.4s, v7.4s ,v29.4s + smax v0.4s, v0.4s ,v30.4s + smax v1.4s, v1.4s ,v30.4s + smax v2.4s, v2.4s ,v30.4s + smax v3.4s, v3.4s ,v30.4s + smax v4.4s, v4.4s ,v30.4s + smax v5.4s, v5.4s ,v30.4s + smax v6.4s, v6.4s ,v30.4s + smax v7.4s, v7.4s ,v30.4s + smin v0.4s, v0.4s ,v31.4s + smin v1.4s, v1.4s ,v31.4s + smin v2.4s, v2.4s ,v31.4s + smin v3.4s, v3.4s ,v31.4s + smin v4.4s, v4.4s ,v31.4s + smin v5.4s, v5.4s ,v31.4s + smin v6.4s, v6.4s ,v31.4s + smin v7.4s, v7.4s ,v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + sqxtn v2.4h, v2.4s + sqxtn v3.4h, v3.4s + sqxtn v4.4h, v4.4s + sqxtn v5.4h, v5.4s + sqxtn v6.4h, v6.4s + sqxtn v7.4h, v7.4s + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + sqxtn v4.8b, v4.8h + sqxtn v5.8b, v5.8h + sqxtn v6.8b, v6.8h + sqxtn v7.8b, v7.8h + + add x17, x3, #1 + add x18, x3, #2 + add x21, x3, #3 + st1 {v0.b}[0], [x3], x9 + st1 {v0.b}[1], [x17], x9 + st1 {v0.b}[2], [x18], x9 + st1 {v0.b}[3], [x21], x9 + + st1 {v1.b}[0], [x3], x9 + st1 {v1.b}[1], [x17], x9 + st1 {v1.b}[2], [x18], x9 + st1 {v1.b}[3], [x21], x9 + + st1 {v2.b}[0], [x3], x9 + st1 {v2.b}[1], [x17], x9 + st1 {v2.b}[2], [x18], x9 + st1 {v2.b}[3], [x21], x9 + + st1 {v3.b}[0], [x3], x9 + st1 {v3.b}[1], [x17], x9 + st1 {v3.b}[2], [x18], x9 + st1 {v3.b}[3], [x21], x9 + + st1 {v4.b}[0], [x3], x9 + st1 {v4.b}[1], [x17], x9 + st1 {v4.b}[2], [x18], x9 + st1 {v4.b}[3], [x21], x9 + + st1 {v5.b}[0], [x3], x9 + st1 {v5.b}[1], [x17], x9 + st1 {v5.b}[2], [x18], x9 + st1 {v5.b}[3], [x21], x9 + + st1 {v6.b}[0], [x3], x9 + st1 {v6.b}[1], [x17], x9 + st1 {v6.b}[2], [x18], x9 + st1 {v6.b}[3], [x21], x9 + + st1 {v7.b}[0], [x3], x9 + st1 {v7.b}[1], [x17], x9 + st1 {v7.b}[2], [x18], x9 + st1 {v7.b}[3], [x21], x9 + + add x23, x23, x19 + sub x24, x24, #8 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + bge LoopW8 + LoopW: + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + LoopKh: + mov x18, x7 + mov x22, x16 + LoopKw: + ld1 {v16.4h}, [x22], x13 + ld1 {v25.4h}, [x17], #8 + smlal v0.4s, v16.4h, v25.4h + subs x18, x18, #1 + bne LoopKw + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh + + sqshl v0.4s, v0.4s ,v26.4s + sqrdmulh v0.4s, v0.4s ,v27.4s + sqrshl v0.4s, v0.4s ,v28.4s + add v0.4s, v0.4s ,v29.4s + smax v0.4s, v0.4s ,v30.4s + smin v0.4s, v0.4s ,v31.4s + + sqxtn v0.4h, v0.4s + sqxtn v0.8b, v0.8h + + mov x17, x3 + st1 {v0.b}[0], [x17], #1 + st1 {v0.b}[1], [x17], #1 + st1 {v0.b}[2], [x17], #1 + st1 {v0.b}[3], [x17], #1 + add x3, x3, x9 + + add x23, x23, x11 + subs x24, x24, #1 + bne LoopW + LoopWEnd: + add x0, x0, x8 + add x1, x1, x10 + subs x4, x4, #1 + bne LoopH + + sub sp, sp, #48 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S index d88c61047c..07cd1a5cea 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S @@ -35,12 +35,12 @@ DeconvDwFp32Center: mov x18, x15 mov x19, x2 mov x20, x5 - dup v0.4s, wzr + ld1 {v1.4s}, [x16], x8 LoopKh: mov x21, x18 mov x13, x6 LoopKw: - ld1 {v1.4s}, [x16] + ld1 {v0.4s}, [x21] ld1 {v2.4s}, [x19], #16 fmla v0.4s, v1.4s, v2.4s st1 {v0.4s}, [x21], x12 @@ -50,7 +50,6 @@ DeconvDwFp32Center: subs x20, x20, #1 bne LoopKh add x15, x15, x10 - add x16, x16, x8 subs x17, x17, #1 bne LoopW add x0, x0, x9 diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwInt8Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwInt8Center.S new file mode 100644 index 0000000000..25433d7a5f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwInt8Center.S @@ -0,0 +1,65 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global DeconvDwInt8Center +#ifndef __APPLE__ +.type DeconvDwInt8Center, %function +#endif + +// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step); +// x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step +// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step +DeconvDwInt8Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #32 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + + LoopH: + mov x15, x0 + mov x16, x1 + mov x17, x4 + LoopW: + mov x18, x15 + mov x19, x2 + mov x20, x5 + ld1 {v1.4h}, [x16], x8 + LoopKh: + mov x21, x18 + mov x13, x6 + LoopKw: + ld1 {v0.4s}, [x21] + ld1 {v2.4h}, [x19], #8 + smlal v0.4s, v1.4h, v2.4h + st1 {v0.4s}, [x21], x12 + subs x13, x13, #1 + bne LoopKw + add x18, x18, x11 + subs x20, x20, #1 + bne LoopKh + add x15, x15, x10 + add x16, x16, x8 + subs x17, x17, #1 + bne LoopW + add x0, x0, x9 + add x1, x1, x7 + subs x3, x3, #1 + bne LoopH + + sub sp, sp, #32 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/ConvDwFp16Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/ConvDwFp16Center.S new file mode 100644 index 0000000000..6b27af6a6e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/ConvDwFp16Center.S @@ -0,0 +1,294 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global ConvDwFp16Center +#ifndef __APPLE__ +.type ConvDwFp16Center, %function +#endif + +// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +// x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, +// x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step +// x14: relu, x15: relu6 +ConvDwFp16Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #48 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + ldr x13, [sp, #40] + ldr x14, [sp, #48] + ldr x15, [sp, #56] + + ld1 {v24.8h}, [x3] + movi v26.8h, #0x46, lsl #8 + dup v27.4s, wzr + + LoopH: + mov x23, x1 + mov x24, x5 + mov x3, x0 + cmp x24, #8 + blt LoopW + cmp x24, #16 + blt LoopW8 + + LoopW16: + mov x19, #16 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + mov v8.16b, v24.16b + mov v9.16b, v24.16b + mov v10.16b, v24.16b + mov v11.16b, v24.16b + mov v12.16b, v24.16b + mov v13.16b, v24.16b + mov v14.16b, v24.16b + mov v15.16b, v24.16b + LoopKh16: + mov x18, x7 + mov x21, x16 + LoopKw16: + mov x22, x21 + ld1 {v25.8h}, [x17], #16 + ld1 {v16.8h}, [x22], x11 + ld1 {v17.8h}, [x22], x11 + fmla v0.8h, v16.8h, v25.8h + fmla v1.8h, v17.8h, v25.8h + ld1 {v18.8h}, [x22], x11 + ld1 {v19.8h}, [x22], x11 + fmla v2.8h, v18.8h, v25.8h + fmla v3.8h, v19.8h, v25.8h + ld1 {v20.8h}, [x22], x11 + ld1 {v21.8h}, [x22], x11 + fmla v4.8h, v20.8h, v25.8h + fmla v5.8h, v21.8h, v25.8h + ld1 {v22.8h}, [x22], x11 + ld1 {v23.8h}, [x22], x11 + fmla v6.8h, v22.8h, v25.8h + fmla v7.8h, v23.8h, v25.8h + ld1 {v16.8h}, [x22], x11 + ld1 {v17.8h}, [x22], x11 + fmla v8.8h, v16.8h, v25.8h + fmla v9.8h, v17.8h, v25.8h + ld1 {v18.8h}, [x22], x11 + ld1 {v19.8h}, [x22], x11 + fmla v10.8h, v18.8h, v25.8h + fmla v11.8h, v19.8h, v25.8h + ld1 {v20.8h}, [x22], x11 + ld1 {v21.8h}, [x22], x11 + fmla v12.8h, v20.8h, v25.8h + fmla v13.8h, v21.8h, v25.8h + ld1 {v22.8h}, [x22], x11 + ld1 {v23.8h}, [x22], x11 + fmla v14.8h, v22.8h, v25.8h + fmla v15.8h, v23.8h, v25.8h + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw16 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh16 + cbnz x15, Relu616 + cbnz x14, Relu16 + b Write16 + Relu616: + fmin v0.8h, v0.8h, v26.8h + fmin v1.8h, v1.8h, v26.8h + fmin v2.8h, v2.8h, v26.8h + fmin v3.8h, v3.8h, v26.8h + fmin v4.8h, v4.8h, v26.8h + fmin v5.8h, v5.8h, v26.8h + fmin v6.8h, v6.8h, v26.8h + fmin v7.8h, v7.8h, v26.8h + fmin v8.8h, v8.8h, v26.8h + fmin v9.8h, v9.8h, v26.8h + fmin v10.8h, v10.8h, v26.8h + fmin v11.8h, v11.8h, v26.8h + fmin v12.8h, v12.8h, v26.8h + fmin v13.8h, v13.8h, v26.8h + fmin v14.8h, v14.8h, v26.8h + fmin v15.8h, v15.8h, v26.8h + Relu16: + fmax v0.8h, v0.8h, v27.8h + fmax v1.8h, v1.8h, v27.8h + fmax v2.8h, v2.8h, v27.8h + fmax v3.8h, v3.8h, v27.8h + fmax v4.8h, v4.8h, v27.8h + fmax v5.8h, v5.8h, v27.8h + fmax v6.8h, v6.8h, v27.8h + fmax v7.8h, v7.8h, v27.8h + fmax v8.8h, v8.8h, v27.8h + fmax v9.8h, v9.8h, v27.8h + fmax v10.8h, v10.8h, v27.8h + fmax v11.8h, v11.8h, v27.8h + fmax v12.8h, v12.8h, v27.8h + fmax v13.8h, v13.8h, v27.8h + fmax v14.8h, v14.8h, v27.8h + fmax v15.8h, v15.8h, v27.8h + Write16: + st1 {v0.8h}, [x3], x9 + st1 {v1.8h}, [x3], x9 + st1 {v2.8h}, [x3], x9 + st1 {v3.8h}, [x3], x9 + st1 {v4.8h}, [x3], x9 + st1 {v5.8h}, [x3], x9 + st1 {v6.8h}, [x3], x9 + st1 {v7.8h}, [x3], x9 + st1 {v8.8h}, [x3], x9 + st1 {v9.8h}, [x3], x9 + st1 {v10.8h}, [x3], x9 + st1 {v11.8h}, [x3], x9 + st1 {v12.8h}, [x3], x9 + st1 {v13.8h}, [x3], x9 + st1 {v14.8h}, [x3], x9 + st1 {v15.8h}, [x3], x9 + add x23, x23, x19 + sub x24, x24, #16 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + blt LoopW + cmp x24, #16 + bge LoopW16 + LoopW8: + mov x19, #8 + mul x19, x19, x11 + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v3.16b, v24.16b + mov v4.16b, v24.16b + mov v5.16b, v24.16b + mov v6.16b, v24.16b + mov v7.16b, v24.16b + LoopKh8: + mov x18, x7 + mov x21, x16 + LoopKw8: + mov x22, x21 + ld1 {v25.8h}, [x17], #16 + ld1 {v16.8h}, [x22], x11 + ld1 {v17.8h}, [x22], x11 + fmla v0.8h, v16.8h, v25.8h + fmla v1.8h, v17.8h, v25.8h + ld1 {v18.8h}, [x22], x11 + ld1 {v19.8h}, [x22], x11 + fmla v2.8h, v18.8h, v25.8h + fmla v3.8h, v19.8h, v25.8h + ld1 {v20.8h}, [x22], x11 + ld1 {v21.8h}, [x22], x11 + fmla v4.8h, v20.8h, v25.8h + fmla v5.8h, v21.8h, v25.8h + ld1 {v22.8h}, [x22], x11 + ld1 {v23.8h}, [x22], x11 + fmla v6.8h, v22.8h, v25.8h + fmla v7.8h, v23.8h, v25.8h + subs x18, x18, #1 + add x21, x21, x13 + bne LoopKw8 + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh8 + cbnz x15, Relu68 + cbnz x14, Relu8 + b Write8 + Relu68: + fmin v0.8h, v0.8h, v26.8h + fmin v1.8h, v1.8h, v26.8h + fmin v2.8h, v2.8h, v26.8h + fmin v3.8h, v3.8h, v26.8h + fmin v4.8h, v4.8h, v26.8h + fmin v5.8h, v5.8h, v26.8h + fmin v6.8h, v6.8h, v26.8h + fmin v7.8h, v7.8h, v26.8h + Relu8: + fmax v0.8h, v0.8h, v27.8h + fmax v1.8h, v1.8h, v27.8h + fmax v2.8h, v2.8h, v27.8h + fmax v3.8h, v3.8h, v27.8h + fmax v4.8h, v4.8h, v27.8h + fmax v5.8h, v5.8h, v27.8h + fmax v6.8h, v6.8h, v27.8h + fmax v7.8h, v7.8h, v27.8h + Write8: + st1 {v0.8h}, [x3], x9 + st1 {v1.8h}, [x3], x9 + st1 {v2.8h}, [x3], x9 + st1 {v3.8h}, [x3], x9 + st1 {v4.8h}, [x3], x9 + st1 {v5.8h}, [x3], x9 + st1 {v6.8h}, [x3], x9 + st1 {v7.8h}, [x3], x9 + add x23, x23, x19 + sub x24, x24, #8 + cmp x24, #0 + ble LoopWEnd + cmp x24, #8 + bge LoopW8 + LoopW: + mov x16, x23 + mov x17, x2 + mov x20, x6 + mov v0.16b, v24.16b + LoopKh: + mov x18, x7 + mov x22, x16 + LoopKw: + ld1 {v16.8h}, [x22], x13 + ld1 {v25.8h}, [x17], #16 + fmla v0.8h, v16.8h, v25.8h + subs x18, x18, #1 + bne LoopKw + add x16, x16, x12 + subs x20, x20, #1 + bne LoopKh + cbnz x15, Relu6 + cbnz x14, Relu + b Write + Relu6: + fmin v0.8h, v0.8h, v26.8h + Relu: + fmax v0.8h, v0.8h, v27.8h + Write: + st1 {v0.8h}, [x3], x9 + add x23, x23, x11 + subs x24, x24, #1 + bne LoopW + LoopWEnd: + add x0, x0, x8 + add x1, x1, x10 + subs x4, x4, #1 + bne LoopH + + sub sp, sp, #48 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/DeconvDwFp16Center.S b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/DeconvDwFp16Center.S new file mode 100644 index 0000000000..1087856cb5 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/assembly/opt/DeconvDwFp16Center.S @@ -0,0 +1,64 @@ +#ifdef __aarch64__ + +.text +.align 5 +.global DeconvDwFp16Center +#ifndef __APPLE__ +.type DeconvDwFp16Center, %function +#endif + +// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, +// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, +// size_t in_kh_step, size_t in_kw_step); +// x0: dst, x1: src, x2: weight, x3: height, x4: weight, x5: kernel_h, x6: kernel_w, x7: out_h_step +// x8: block_channel, x9: in_sh_step, x10: in_sw_step, x11: in_kh_step, x12: in_kw_step +DeconvDwFp16Center: + // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to + // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers + // x19 ~ x29 should be also preserved + // whereas our coding style do not permit such amount of parameters + sub sp, sp, #32 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] + ldr x10, [sp, #16] + ldr x11, [sp, #24] + ldr x12, [sp, #32] + + LoopH: + mov x15, x0 + mov x16, x1 + mov x17, x4 + LoopW: + mov x18, x15 + mov x19, x2 + mov x20, x5 + ld1 {v1.8h}, [x16], x8 + LoopKh: + mov x21, x18 + mov x13, x6 + LoopKw: + ld1 {v0.8h}, [x21] + ld1 {v2.8h}, [x19], #16 + fmla v0.8h, v1.8h, v2.8h + st1 {v0.8h}, [x21], x12 + subs x13, x13, #1 + bne LoopKw + add x18, x18, x11 + subs x20, x20, #1 + bne LoopKh + add x15, x15, x10 + subs x17, x17, #1 + bne LoopW + add x0, x0, x9 + add x1, x1, x7 + subs x3, x3, #1 + bne LoopH + + sub sp, sp, #32 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ret +#endif diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/common_func.h new file mode 100644 index 0000000000..30383c90ad --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/common_func.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ + +#include +#include +#include +#include "src/runtime/kernel/arm/opclib/op_base.h" +#include "src/runtime/kernel/arm/opclib/conv_parameter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef ENABLE_ARM64 +void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, + size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, + size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, + size_t in_kw_step, size_t relu, size_t relu6); +void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_COMMON_FUNC_H_ */ diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc index 122f1fe29d..e038ab24ce 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc @@ -16,6 +16,7 @@ #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" #include +#include "src/runtime/kernel/arm/opclib/fp16/common_func.h" /*conv depthwise fp16 begin*/ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, @@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * } // height loop } +#ifndef ENABLE_ARM64 void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { @@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * const float16_t *src_kw = src_kh; const float16_t *weight_kw = weight_kh; for (int kw = 0; kw < kernel_w; kw++) { +#ifdef ENABLE_ARM64 float16x8_t src_8 = vld1q_f16(src_kw); float16x8_t weight_8 = vld1q_f16(weight_kw); float16x8_t dst_8 = vld1q_f16(dst_w); dst_8 = vfmaq_f16(dst_8, src_8, weight_8); vst1q_f16(dst_w, dst_8); - +#else + for (int c = 0; c < C8NUM; c++) { + dst_w[c] += src_kw[c] * weight_kw[c]; + } +#endif src_kw += in_kw_step; weight_kw += C8NUM; } // kernel_w loop @@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * src_h += in_sh_step; } // dst_height loop } +#endif // conv depthwise fp16: sliding window void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, @@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; - +#ifdef ENABLE_ARM64 + ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, + sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, + sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), + sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), + sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), + conv_param->is_relu_, conv_param->is_relu6_); +#else DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); +#endif } } // output C8 loop src += sliding->in_step_; @@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float } // height loop } +#ifndef ENABLE_ARM64 void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step) { @@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float float16_t *dst_kw = dst_kh; const float16_t *weight_kw = weight_kh; for (int kw = 0; kw < kernel_w; kw++) { +#ifdef ENABLE_ARM64 float16x8_t src_8 = vld1q_f16(src_w); float16x8_t weight_8 = vld1q_f16(weight_kw); float16x8_t dst_8 = vld1q_f16(dst_kw); dst_8 = vfmaq_f16(dst_8, src_8, weight_8); vst1q_f16(dst_kw, dst_8); - +#else + for (int c = 0; c < C8NUM; c++) { + dst_kw[c] += src_w[c] * weight_kw[c]; + } +#endif dst_kw += in_kw_step; weight_kw += C8NUM; } // kernel_w loop @@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float src_h += out_h_step; } // dst_height loop } +#endif void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, const ConvParameter *conv_param) { @@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; const float16_t *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; - +#ifdef ENABLE_ARM64 + DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, + sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, + sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), + sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), + sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t)); +#else DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); +#endif } DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param); } // output C8 loop diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h index e152b9baba..1c1f454caf 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h @@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col, size_t c_stride, size_t x_stride); +#ifdef ENABLE_ARM +void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); +void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +#endif + #ifdef ENABLE_ARM64 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); @@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); -void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, - size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, - size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); -void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, - size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, - size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); #endif #ifdef __cplusplus diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/common_func.h new file mode 100644 index 0000000000..0f361bebaf --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/common_func.h @@ -0,0 +1,62 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ + +#include +#include +#include +#include "src/runtime/kernel/arm/opclib/op_base.h" +#include "src/runtime/kernel/arm/opclib/conv_parameter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef ENABLE_ARM +void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, + size_t oc4, size_t offset); + +#ifdef ENABLE_ARM64 +void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, + size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, + size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, + size_t shift_after); +#elif defined(ENABLE_ARM32) +void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, + size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, + size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, + size_t shift_after); +#endif +#endif + +#ifdef ENABLE_ARM +void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, + size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, + size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); +void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, + size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, + size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, + int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_COMMON_FUNC_H_ */ + diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc index b44024d913..2af0b5acf7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc @@ -17,6 +17,7 @@ #include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h" #include #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h" +#include "src/runtime/kernel/arm/opclib/int8/common_func.h" /*conv depthwise int8 begin*/ void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, @@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight, } // height loop } +#ifndef ENABLE_ARM64 void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift, @@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, src_h += in_sh_step; } // dst_height loop } +#endif void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { @@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM; int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM; - +#ifdef ENABLE_ARM64 + ConvDwInt8Center( + out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, + conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t), + sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t), + sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t), + sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0], + conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0], + conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], + conv_param->conv_quant_arg_.out_act_max_[0]); +#else DepthwiseCenterInt8( out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, @@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]); +#endif } } // output C4 loop src += sliding->in_step_; @@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t * } // height loop } +#ifndef ENABLE_ARM64 void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step) { @@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t * src_h += out_h_step; } // dst_height loop } +#endif void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift, @@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM; const int16_t *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; - +#ifdef ENABLE_ARM64 + DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, + sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, + sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t), + sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t), + sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t)); +#else DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); +#endif } DeconvDepthwisePostFuncInt8( dst_data, output_buffer, bias, sliding->block_channel_, conv_param, diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc index 2c460e0713..81c1f8f30d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc @@ -17,25 +17,7 @@ #include "src/runtime/kernel/arm/opclib/int8/conv_int8.h" #include #include "src/runtime/kernel/arm/opclib/winograd_transform.h" - -extern "C" { -#ifdef ENABLE_ARM -void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, - size_t oc4, size_t offset); - -#ifdef ENABLE_ARM64 -void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, - size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, - size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, - size_t shift_after); -#elif defined(ENABLE_ARM32) -void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, - size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, - size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, - size_t shift_after); -#endif -#endif -} +#include "src/runtime/kernel/arm/opclib/int8/common_func.h" void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,