Merge pull request !3961 from lixian/mastertags/v0.7.0-beta
| @@ -0,0 +1,161 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global ConvDwFp32Center
#ifndef __APPLE__
    .type ConvDwFp32Center, %function
#endif

// void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       size_t relu, size_t relu6);
// r0: dst, r1: src, r2: weight, r3: bias
// stack args (offsets valid after the sp rewind below):
//   #48: height, #52: width, #56: kernel_h, #60: kernel_w, #64: out_h_step, #68: block_channel,
//   #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step, #88: relu, #92: relu6
ConvDwFp32Center:
    // r4-r8/r10-r11 and q4-q7 are callee-saved per AAPCS32; r0-r3 are pushed too so the
    // current dst/src/weight pointers can be reloaded from [sp], [sp, #4], [sp, #8].
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}                // FIX: "vpush {v4-v7}" is not valid ARM32 syntax
    add sp, sp, #112             // rewind sp (48 + 64 bytes pushed) so stack args keep their offsets
    ldr r4, [sp, #48]            // height
    vld1.32 {q13}, [r3]          // bias (4 channels)
    vmov.i32 q14, #6
    vcvt.f32.s32 q14, q14        // 6.0f for relu6 clamping
    veor q15, q15, q15           // 0.0f for relu clamping
LoopH:
    ldr r1, [sp, #4]             // src_w
    ldr r5, [sp, #52]            // width
    ldr r0, [sp]                 // dst_w
    cmp r5, #4
    blt LoopW                    // fewer than 4 output columns: scalar path
LoopW4:
    ldr r11, [sp, #76]           // in_sw_step (FIX: was invalid "mov r11, [sp, #76]")
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q13                 // FIX: all four accumulators must start from the bias;
    vmov q1, q13                 //      previously only q0 was initialized and q1-q3
    vmov q2, q13                 //      accumulated whatever was left in the registers
    vmov q3, q13
LoopKh4:
    ldr r12, [sp, #84]           // in_kw_step (FIX: was [sp, #80] = in_kh_step)
    ldr r7, [sp, #60]            // kernel_w
    mov lr, r8                   // src_kw (kernel-column base)
LoopKw4:
    mov r10, lr
    vld1.32 {q12}, [r2]!         // one kernel tap, shared by the 4 outputs
    vld1.32 {q4}, [r10]
    add r10, r10, r11
    vmla.f32 q0, q4, q12
    vld1.32 {q5}, [r10]
    add r10, r10, r11
    vmla.f32 q1, q5, q12
    vld1.32 {q6}, [r10]
    add r10, r10, r11
    vmla.f32 q2, q6, q12
    vld1.32 {q7}, [r10]
    add r10, r10, r11
    vmla.f32 q3, q7, q12
    subs r7, r7, #1
    add lr, lr, r12              // advance by in_kw_step to the next kernel column
    bne LoopKw4
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh4
    ldr r12, [sp, #92]           // relu6 flag
    cmp r12, #0
    bne Relu64
    ldr r12, [sp, #88]           // relu flag
    cmp r12, #0
    bne Relu4
    b Write4
Relu64:
    vmin.f32 q0, q0, q14
    vmin.f32 q1, q1, q14
    vmin.f32 q2, q2, q14
    vmin.f32 q3, q3, q14
Relu4:
    vmax.f32 q0, q0, q15
    vmax.f32 q1, q1, q15
    vmax.f32 q2, q2, q15
    vmax.f32 q3, q3, q15
Write4:
    ldr r12, [sp, #68]           // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    vst1.32 {q1}, [r0]
    add r0, r0, r12
    vst1.32 {q2}, [r0]
    add r0, r0, r12
    vst1.32 {q3}, [r0]
    add r0, r0, r12
    mov r12, #4
    mul r11, r11, r12            // 4 * in_sw_step
    add r1, r1, r11
    subs r5, r5, #4              // FIX: was "sub" followed by invalid "cmp r5, r5, #0"
    ble LoopWEnd
    cmp r5, #4
    bge LoopW4                   // FIX: was "bge LoopW", which defeated the 4x path
LoopW:
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q13                 // accumulator = bias
LoopKh:
    ldr r12, [sp, #84]           // in_kw_step
    ldr r7, [sp, #60]            // kernel_w
    mov r10, r8                  // src_kw
LoopKw:
    vld1.32 {q1}, [r10]
    add r10, r10, r12
    vld1.32 {q12}, [r2]!
    vmla.f32 q0, q1, q12
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh
    ldr r12, [sp, #92]
    cmp r12, #0
    bne Relu6
    ldr r12, [sp, #88]
    cmp r12, #0
    bne Relu
    b Write
Relu6:
    vmin.f32 q0, q0, q14
Relu:
    vmax.f32 q0, q0, q15
Write:
    ldr r12, [sp, #68]           // block_channel
    vst1.32 {q0}, [r0]
    add r0, r0, r12
    ldr r12, [sp, #76]           // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:
    // FIX: LoopWEnd must precede the per-row pointer update; it previously sat after
    // "bne LoopH", so a width divisible by 4 exited after processing a single row.
    ldr r3, [sp, #64]            // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                // saved dst += out_h_step
    ldr r3, [sp, #72]            // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]            // saved src += in_sh_step
    subs r4, r4, #1
    bne LoopH
    sub sp, sp, #112
    vpop {q4-q7}                 // FIX: matches the corrected vpush
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,207 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global ConvDwInt8Center
#ifndef __APPLE__
    .type ConvDwInt8Center, %function
#endif

// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height,
//                       size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                       size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
//                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp,
//                       int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias
// stack args (offsets valid after the sp rewind below):
//   #48: height, #52: width, #56: kernel_h, #60: kernel_w, #64: out_h_step, #68: block_channel,
//   #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step,
//   #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
//
// Register plan: q0-q3 accumulators; d8/d10/d12/d14 (q4-q7, callee-saved) source loads;
// d16 (q8) weight tap; q9..q14 quantization constants; q15 bias.
ConvDwInt8Center:
    // r4-r8/r10-r11 and q4-q7 are callee-saved per AAPCS32; r0-r3 are pushed too so the
    // current dst/src/weight pointers can be reloaded from [sp], [sp, #4], [sp, #8].
    push {r0-r8, r10, r11, lr}
    vpush {q4-q7}
    add sp, sp, #112             // rewind sp (48 + 64 bytes pushed) so stack args keep their offsets
    ldr r4, [sp, #48]            // height
    ldr r12, [sp, #92]
    vdup.32 q9, r12              // left_shift
    ldr r11, [sp, #88]
    vdup.32 q10, r11             // out_multiplier
    ldr r10, [sp, #96]
    vdup.32 q11, r10             // right_shift
    ldr r8, [sp, #100]
    vdup.32 q12, r8              // out_zp
    ldr r7, [sp, #104]
    vdup.32 q13, r7              // acc_min
    ldr r6, [sp, #108]
    vdup.32 q14, r6              // acc_max
    vld1.32 {q15}, [r3]          // bias (4 channels)
LoopH:
    ldr r1, [sp, #4]             // src_w
    ldr r5, [sp, #52]            // width
    ldr r0, [sp]                 // dst_w
    cmp r5, #4                   // FIX: the 4x path previously ran unconditionally,
    blt LoopW                    //      reading/writing out of bounds when width < 4
LoopW4:
    ldr r11, [sp, #76]           // in_sw_step (FIX: was invalid "mov r11, [sp, #76]")
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q15                 // FIX: all four accumulators must start from the bias;
    vmov q1, q15                 //      q1-q3 were previously uninitialized
    vmov q2, q15
    vmov q3, q15
LoopKh4:
    ldr r12, [sp, #84]           // in_kw_step (the kw advance below needs it)
    ldr r7, [sp, #60]            // kernel_w
    mov lr, r8                   // src_kw (kernel-column base)
LoopKw4:
    mov r10, lr
    // FIX: weight tap now goes to d16; the original used d24, which is the low half of
    // q12 and silently corrupted the out_zp broadcast consumed by the vadd below.
    vld1.16 {d16}, [r2]!
    vld1.16 {d8}, [r10]
    add r10, r10, r11
    vmlal.s16 q0, d8, d16
    vld1.16 {d10}, [r10]
    add r10, r10, r11
    vmlal.s16 q1, d10, d16
    vld1.16 {d12}, [r10]
    add r10, r10, r11
    vmlal.s16 q2, d12, d16
    vld1.16 {d14}, [r10]
    add r10, r10, r11
    vmlal.s16 q3, d14, d16
    subs r7, r7, #1
    add lr, lr, r12              // FIX: advance the kernel-column base by in_kw_step;
    bne LoopKw4                  //      the original never moved it
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh4
    // Requantize: saturating left shift, fixed-point multiply, rounding right shift,
    // add zero point, clamp to [acc_min, acc_max], narrow 32 -> 16 -> 8 bits.
    vshl.s32 q0, q0, q9
    vshl.s32 q1, q1, q9
    vshl.s32 q2, q2, q9
    vshl.s32 q3, q3, q9
    vqrdmulh.s32 q0, q0, q10
    vqrdmulh.s32 q1, q1, q10
    vqrdmulh.s32 q2, q2, q10
    vqrdmulh.s32 q3, q3, q10
    vrshl.s32 q0, q0, q11
    vrshl.s32 q1, q1, q11
    vrshl.s32 q2, q2, q11
    vrshl.s32 q3, q3, q11
    vadd.i32 q0, q0, q12
    vadd.i32 q1, q1, q12
    vadd.i32 q2, q2, q12
    vadd.i32 q3, q3, q12
    vmax.s32 q0, q0, q13
    vmax.s32 q1, q1, q13
    vmax.s32 q2, q2, q13
    vmax.s32 q3, q3, q13
    vmin.s32 q0, q0, q14
    vmin.s32 q1, q1, q14
    vmin.s32 q2, q2, q14
    vmin.s32 q3, q3, q14
    vqmovn.s32 d0, q0
    vqmovn.s32 d2, q1
    vqmovn.s32 d4, q2
    vqmovn.s32 d6, q3
    vqmovn.s16 d0, q0
    vqmovn.s16 d2, q1
    vqmovn.s16 d4, q2
    vqmovn.s16 d6, q3
    ldr r12, [sp, #68]           // block_channel
    mov r3, r0
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d2[0]}, [r3]!
    vst1.8 {d2[1]}, [r3]!
    vst1.8 {d2[2]}, [r3]!
    vst1.8 {d2[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d4[0]}, [r3]!
    vst1.8 {d4[1]}, [r3]!
    vst1.8 {d4[2]}, [r3]!
    vst1.8 {d4[3]}, [r3]!
    add r0, r0, r12
    mov r3, r0
    vst1.8 {d6[0]}, [r3]!
    vst1.8 {d6[1]}, [r3]!
    vst1.8 {d6[2]}, [r3]!
    vst1.8 {d6[3]}, [r3]!
    add r0, r0, r12
    mov r12, #4
    mul r11, r11, r12            // 4 * in_sw_step
    add r1, r1, r11
    subs r5, r5, #4              // FIX: 4 outputs were produced but the counter only
    ble LoopWEnd                 //      dropped by 1, and there was no residual path
    cmp r5, #4
    bge LoopW4
LoopW:
    mov r8, r1                   // src_kh
    ldr r2, [sp, #8]             // weight_kh
    ldr r6, [sp, #56]            // kernel_h
    vmov q0, q15                 // accumulator = bias
LoopKh:
    ldr r12, [sp, #84]           // in_kw_step
    ldr r7, [sp, #60]            // kernel_w
    mov r10, r8                  // src_kw
LoopKw:
    vld1.16 {d2}, [r10]
    add r10, r10, r12
    vld1.16 {d16}, [r2]!         // FIX: d16, not d24 (d24 aliases q12 = out_zp)
    vmlal.s16 q0, d2, d16
    subs r7, r7, #1
    bne LoopKw
    ldr r12, [sp, #80]           // in_kh_step
    add r8, r8, r12
    subs r6, r6, #1
    bne LoopKh
    vshl.s32 q0, q0, q9
    vqrdmulh.s32 q0, q0, q10
    vrshl.s32 q0, q0, q11
    vadd.i32 q0, q0, q12
    vmax.s32 q0, q0, q13
    vmin.s32 q0, q0, q14
    vqmovn.s32 d0, q0
    vqmovn.s16 d0, q0
    mov r3, r0
    ldr r12, [sp, #68]           // block_channel
    vst1.8 {d0[0]}, [r3]!
    vst1.8 {d0[1]}, [r3]!
    vst1.8 {d0[2]}, [r3]!
    vst1.8 {d0[3]}, [r3]!
    add r0, r0, r12
    ldr r12, [sp, #76]           // in_sw_step
    add r1, r1, r12
    subs r5, r5, #1
    bne LoopW
LoopWEnd:
    ldr r3, [sp, #64]            // out_h_step
    ldr r12, [sp]
    add r12, r12, r3
    str r12, [sp]                // saved dst += out_h_step
    ldr r3, [sp, #72]            // in_sh_step
    ldr r12, [sp, #4]
    add r12, r12, r3
    str r12, [sp, #4]            // saved src += in_sh_step
    subs r4, r4, #1
    bne LoopH
    sub sp, sp, #112
    vpop {q4-q7}
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,69 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global DeconvDwFp32Center
#ifndef __APPLE__
    .type DeconvDwFp32Center, %function
#endif

// void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height
// stack args: #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step,
//             #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Depthwise deconvolution: each 4-channel input pixel is multiplied by every kernel
// tap and scattered (read-modify-write) into the destination window.
DeconvDwFp32Center:
    // Callee-saved core registers are spilled; r0-r3 are included so the current
    // dst/src/weight pointers can be re-read from [sp], [sp, #4], [sp, #8] in the loops.
    push {r0-r8, r10, r11, lr}
    ldr r11, [sp, #76]        // in_kh_step
    ldr r10, [sp, #80]        // in_kw_step
LoopH:
    ldr r0, [sp]              // dst_w: output cursor for this row
    ldr r1, [sp, #4]          // src_w: input cursor for this row
    ldr r4, [sp, #48]         // width countdown
LoopW:
    ldr r2, [sp, #8]          // weight_kh: rewind the kernel for every input pixel
    mov r6, r0                // dst_kh
    ldr r5, [sp, #52]         // kernel_h countdown
    vld1.32 {q1}, [r1]        // one input pixel (4 channels), reused for all taps
LoopKh:
    ldr r12, [sp, #56]        // kernel_w countdown
    mov r7, r6                // dst_kw
LoopKw:
    vld1.32 {q2}, [r2]!       // next kernel tap
    vld1.32 {q0}, [r7]        // accumulate into the destination in place
    vmla.f32 q0, q1, q2
    vst1.32 {q0}, [r7]
    add r7, r7, r10
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]        // in_sw_step
    add r0, r0, r12
    ldr r8, [sp, #64]         // block_channel
    add r1, r1, r8
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]         // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]             // advance the saved dst row pointer
    ldr r8, [sp, #60]         // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]         // advance the saved src row pointer
    subs r3, r3, #1
    bne LoopH
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -0,0 +1,69 @@ | |||
#ifdef __arm__
#ifndef __aarch64__
    .text
    .align 5
    .global DeconvDwInt8Center
#ifndef __APPLE__
    .type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// r0: dst, r1: src, r2: weight, r3: height
// stack args: #48: width, #52: kernel_h, #56: kernel_w, #60: out_h_step,
//             #64: block_channel, #68: in_sh_step, #72: in_sw_step, #76: in_kh_step, #80: in_kw_step
//
// Int16 x int16 -> int32 depthwise deconvolution: each 4-channel input pixel is
// widening-multiplied by every kernel tap and accumulated into the int32 destination.
DeconvDwInt8Center:
    // Callee-saved core registers are spilled; r0-r3 are included so the current
    // dst/src/weight pointers can be re-read from [sp], [sp, #4], [sp, #8] in the loops.
    push {r0-r8, r10, r11, lr}
    ldr r11, [sp, #76]        // in_kh_step
    ldr r10, [sp, #80]        // in_kw_step
LoopH:
    ldr r0, [sp]              // dst_w: output cursor for this row
    ldr r1, [sp, #4]          // src_w: input cursor for this row
    ldr r4, [sp, #48]         // width countdown
LoopW:
    ldr r2, [sp, #8]          // weight_kh: rewind the kernel for every input pixel
    mov r6, r0                // dst_kh
    ldr r5, [sp, #52]         // kernel_h countdown
    vld1.16 {d2}, [r1]        // one input pixel (4 x int16), reused for all taps
LoopKh:
    ldr r12, [sp, #56]        // kernel_w countdown
    mov r7, r6                // dst_kw
LoopKw:
    vld1.16 {d24}, [r2]!      // next kernel tap (4 x int16)
    vld1.32 {q0}, [r7]        // accumulate (widening) into the destination in place
    vmlal.s16 q0, d2, d24
    vst1.32 {q0}, [r7]
    add r7, r7, r10
    subs r12, r12, #1
    bne LoopKw
    add r6, r6, r11
    subs r5, r5, #1
    bne LoopKh
    ldr r12, [sp, #72]        // in_sw_step
    add r0, r0, r12
    ldr r8, [sp, #64]         // block_channel
    add r1, r1, r8
    subs r4, r4, #1
    bne LoopW
    ldr r8, [sp, #68]         // in_sh_step
    ldr r12, [sp]
    add r12, r12, r8
    str r12, [sp]             // advance the saved dst row pointer
    ldr r8, [sp, #60]         // out_h_step
    ldr r12, [sp, #4]
    add r12, r12, r8
    str r12, [sp, #4]         // advance the saved src row pointer
    subs r3, r3, #1
    bne LoopH
    pop {r0-r8, r10, r11, pc}
#endif
#endif
| @@ -32,24 +32,238 @@ ConvDwFp32Center: | |||
| ldr x14, [sp, #48] | |||
| ldr x15, [sp, #56] | |||
| ld1 {v5.4s}, [x3] | |||
| ld1 {v24.4s}, [x3] | |||
| movi v26.4s, #6 | |||
| scvtf v26.4s, v26.4s | |||
| dup v27.4s, wzr | |||
| LoopH: | |||
| mov x23, x1 | |||
| mov x24, x5 | |||
| mov x3, x0 | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| blt LoopW8 | |||
| LoopW16: | |||
| mov x19, #16 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| mov v8.16b, v24.16b | |||
| mov v9.16b, v24.16b | |||
| mov v10.16b, v24.16b | |||
| mov v11.16b, v24.16b | |||
| mov v12.16b, v24.16b | |||
| mov v13.16b, v24.16b | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| fmla v1.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v2.4s, v18.4s, v25.4s | |||
| fmla v3.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v4.4s, v20.4s, v25.4s | |||
| fmla v5.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v6.4s, v22.4s, v25.4s | |||
| fmla v7.4s, v23.4s, v25.4s | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v8.4s, v16.4s, v25.4s | |||
| fmla v9.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v10.4s, v18.4s, v25.4s | |||
| fmla v11.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v12.4s, v20.4s, v25.4s | |||
| fmla v13.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v14.4s, v22.4s, v25.4s | |||
| fmla v15.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh16 | |||
| cbnz x15, Relu616 | |||
| cbnz x14, Relu16 | |||
| b Write16 | |||
| Relu616: | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| fmin v1.4s, v1.4s, v26.4s | |||
| fmin v2.4s, v2.4s, v26.4s | |||
| fmin v3.4s, v3.4s, v26.4s | |||
| fmin v4.4s, v4.4s, v26.4s | |||
| fmin v5.4s, v5.4s, v26.4s | |||
| fmin v6.4s, v6.4s, v26.4s | |||
| fmin v7.4s, v7.4s, v26.4s | |||
| fmin v8.4s, v8.4s, v26.4s | |||
| fmin v9.4s, v9.4s, v26.4s | |||
| fmin v10.4s, v10.4s, v26.4s | |||
| fmin v11.4s, v11.4s, v26.4s | |||
| fmin v12.4s, v12.4s, v26.4s | |||
| fmin v13.4s, v13.4s, v26.4s | |||
| fmin v14.4s, v14.4s, v26.4s | |||
| fmin v15.4s, v15.4s, v26.4s | |||
| Relu16: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| fmax v2.4s, v2.4s, v27.4s | |||
| fmax v3.4s, v3.4s, v27.4s | |||
| fmax v4.4s, v4.4s, v27.4s | |||
| fmax v5.4s, v5.4s, v27.4s | |||
| fmax v6.4s, v6.4s, v27.4s | |||
| fmax v7.4s, v7.4s, v27.4s | |||
| fmax v8.4s, v8.4s, v27.4s | |||
| fmax v9.4s, v9.4s, v27.4s | |||
| fmax v10.4s, v10.4s, v27.4s | |||
| fmax v11.4s, v11.4s, v27.4s | |||
| fmax v12.4s, v12.4s, v27.4s | |||
| fmax v13.4s, v13.4s, v27.4s | |||
| fmax v14.4s, v14.4s, v27.4s | |||
| fmax v15.4s, v15.4s, v27.4s | |||
| Write16: | |||
| st1 {v0.4s}, [x3], x9 | |||
| st1 {v1.4s}, [x3], x9 | |||
| st1 {v2.4s}, [x3], x9 | |||
| st1 {v3.4s}, [x3], x9 | |||
| st1 {v4.4s}, [x3], x9 | |||
| st1 {v5.4s}, [x3], x9 | |||
| st1 {v6.4s}, [x3], x9 | |||
| st1 {v7.4s}, [x3], x9 | |||
| st1 {v8.4s}, [x3], x9 | |||
| st1 {v9.4s}, [x3], x9 | |||
| st1 {v10.4s}, [x3], x9 | |||
| st1 {v11.4s}, [x3], x9 | |||
| st1 {v12.4s}, [x3], x9 | |||
| st1 {v13.4s}, [x3], x9 | |||
| st1 {v14.4s}, [x3], x9 | |||
| st1 {v15.4s}, [x3], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #16 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| bge LoopW16 | |||
| LoopW8: | |||
| mov x19, #8 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| ld1 {v16.4s}, [x22], x11 | |||
| ld1 {v17.4s}, [x22], x11 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| fmla v1.4s, v17.4s, v25.4s | |||
| ld1 {v18.4s}, [x22], x11 | |||
| ld1 {v19.4s}, [x22], x11 | |||
| fmla v2.4s, v18.4s, v25.4s | |||
| fmla v3.4s, v19.4s, v25.4s | |||
| ld1 {v20.4s}, [x22], x11 | |||
| ld1 {v21.4s}, [x22], x11 | |||
| fmla v4.4s, v20.4s, v25.4s | |||
| fmla v5.4s, v21.4s, v25.4s | |||
| ld1 {v22.4s}, [x22], x11 | |||
| ld1 {v23.4s}, [x22], x11 | |||
| fmla v6.4s, v22.4s, v25.4s | |||
| fmla v7.4s, v23.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh8 | |||
| cbnz x15, Relu68 | |||
| cbnz x14, Relu8 | |||
| b Write8 | |||
| Relu68: | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| fmin v1.4s, v1.4s, v26.4s | |||
| fmin v2.4s, v2.4s, v26.4s | |||
| fmin v3.4s, v3.4s, v26.4s | |||
| fmin v4.4s, v4.4s, v26.4s | |||
| fmin v5.4s, v5.4s, v26.4s | |||
| fmin v6.4s, v6.4s, v26.4s | |||
| fmin v7.4s, v7.4s, v26.4s | |||
| Relu8: | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| fmax v1.4s, v1.4s, v27.4s | |||
| fmax v2.4s, v2.4s, v27.4s | |||
| fmax v3.4s, v3.4s, v27.4s | |||
| fmax v4.4s, v4.4s, v27.4s | |||
| fmax v5.4s, v5.4s, v27.4s | |||
| fmax v6.4s, v6.4s, v27.4s | |||
| fmax v7.4s, v7.4s, v27.4s | |||
| Write8: | |||
| st1 {v0.4s}, [x3], x9 | |||
| st1 {v1.4s}, [x3], x9 | |||
| st1 {v2.4s}, [x3], x9 | |||
| st1 {v3.4s}, [x3], x9 | |||
| st1 {v4.4s}, [x3], x9 | |||
| st1 {v5.4s}, [x3], x9 | |||
| st1 {v6.4s}, [x3], x9 | |||
| st1 {v7.4s}, [x3], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #8 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| bge LoopW8 | |||
| LoopW: | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v5.16b | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v1.4s}, [x22], x13 | |||
| ld1 {v2.4s}, [x17], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| ld1 {v16.4s}, [x22], x13 | |||
| ld1 {v25.4s}, [x17], #16 | |||
| fmla v0.4s, v16.4s, v25.4s | |||
| subs x18, x18, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| @@ -59,17 +273,15 @@ ConvDwFp32Center: | |||
| cbnz x14, Relu | |||
| b Write | |||
| Relu6: | |||
| movi v4.4s, #6 | |||
| scvtf v4.4s, v4.4s | |||
| fmin v0.4s, v0.4s, v4.4s | |||
| fmin v0.4s, v0.4s, v26.4s | |||
| Relu: | |||
| dup v3.4s, wzr | |||
| fmax v0.4s, v0.4s, v3.4s | |||
| fmax v0.4s, v0.4s, v27.4s | |||
| Write: | |||
| st1 {v0.4s}, [x3], x9 | |||
| add x23, x23, x11 | |||
| subs x24, x24, #1 | |||
| bne LoopW | |||
| LoopWEnd: | |||
| add x0, x0, x8 | |||
| add x1, x1, x10 | |||
| subs x4, x4, #1 | |||
| @@ -0,0 +1,558 @@ | |||
| #ifdef __aarch64__ | |||
| .text | |||
| .align 5 | |||
| .global ConvDwInt8Center | |||
| #ifndef __APPLE__ | |||
| .type ConvDwInt8Center, %function | |||
| #endif | |||
| // void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, | |||
| // size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, | |||
| // size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, | |||
| // int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | |||
| // x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: weight, x6: kernel_h, x7: kernel_w, | |||
| // x8: out_h_step, x9: block_channel, x10: in_sh_step, x11: in_sw_step, x12: in_kh_step, x13: in_kw_step | |||
| // x14: out_multiplier, #56: left_shift, #64: right_shift, #72:out_zp, #80: acc_min, #88: acc_max | |||
| ConvDwInt8Center: | |||
| // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to | |||
| // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers | |||
| // x19 ~ x29 should be also preserved | |||
| // whereas our coding style do not permit such amount of parameters | |||
| sub sp, sp, #48 | |||
| stp x19, x20, [sp], #16 | |||
| stp x21, x22, [sp], #16 | |||
| stp x23, x24, [sp], #16 | |||
| ldr x8, [sp] | |||
| ldr x9, [sp, #8] | |||
| ldr x10, [sp, #16] | |||
| ldr x11, [sp, #24] | |||
| ldr x12, [sp, #32] | |||
| ldr x13, [sp, #40] | |||
| ldr w14, [sp, #56] | |||
| dup v26.4s, w14 | |||
| ldr x15, [sp, #48] | |||
| dup v27.4s, w15 | |||
| ldr w16, [sp, #64] | |||
| dup v28.4s, w16 | |||
| ldr w17, [sp, #72] | |||
| dup v29.4s, w17 | |||
| ldr w18, [sp, #80] | |||
| dup v30.4s, w18 | |||
| ldr w19, [sp, #88] | |||
| dup v31.4s, w19 | |||
| ld1 {v24.4s}, [x3] | |||
| LoopH: | |||
| mov x23, x1 | |||
| mov x24, x5 | |||
| mov x3, x0 | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| blt LoopW8 | |||
| LoopW16: | |||
| mov x19, #16 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| mov v8.16b, v24.16b | |||
| mov v9.16b, v24.16b | |||
| mov v10.16b, v24.16b | |||
| mov v11.16b, v24.16b | |||
| mov v12.16b, v24.16b | |||
| mov v13.16b, v24.16b | |||
| mov v14.16b, v24.16b | |||
| mov v15.16b, v24.16b | |||
| LoopKh16: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw16: | |||
| mov x22, x21 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| smlal v1.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v2.4s, v18.4h, v25.4h | |||
| smlal v3.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v4.4s, v20.4h, v25.4h | |||
| smlal v5.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v6.4s, v22.4h, v25.4h | |||
| smlal v7.4s, v23.4h, v25.4h | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v8.4s, v16.4h, v25.4h | |||
| smlal v9.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v10.4s, v18.4h, v25.4h | |||
| smlal v11.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v12.4s, v20.4h, v25.4h | |||
| smlal v13.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v14.4s, v22.4h, v25.4h | |||
| smlal v15.4s, v23.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw16 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh16 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqshl v8.4s, v8.4s ,v26.4s | |||
| sqshl v9.4s, v9.4s ,v26.4s | |||
| sqshl v10.4s, v10.4s ,v26.4s | |||
| sqshl v11.4s, v11.4s ,v26.4s | |||
| sqshl v12.4s, v12.4s ,v26.4s | |||
| sqshl v13.4s, v13.4s ,v26.4s | |||
| sqshl v14.4s, v14.4s ,v26.4s | |||
| sqshl v15.4s, v15.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrdmulh v8.4s, v8.4s ,v27.4s | |||
| sqrdmulh v9.4s, v9.4s ,v27.4s | |||
| sqrdmulh v10.4s, v10.4s ,v27.4s | |||
| sqrdmulh v11.4s, v11.4s ,v27.4s | |||
| sqrdmulh v12.4s, v12.4s ,v27.4s | |||
| sqrdmulh v13.4s, v13.4s ,v27.4s | |||
| sqrdmulh v14.4s, v14.4s ,v27.4s | |||
| sqrdmulh v15.4s, v15.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| sqrshl v8.4s, v8.4s ,v28.4s | |||
| sqrshl v9.4s, v9.4s ,v28.4s | |||
| sqrshl v10.4s, v10.4s ,v28.4s | |||
| sqrshl v11.4s, v11.4s ,v28.4s | |||
| sqrshl v12.4s, v12.4s ,v28.4s | |||
| sqrshl v13.4s, v13.4s ,v28.4s | |||
| sqrshl v14.4s, v14.4s ,v28.4s | |||
| sqrshl v15.4s, v15.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| add v8.4s, v8.4s ,v29.4s | |||
| add v9.4s, v9.4s ,v29.4s | |||
| add v10.4s, v10.4s ,v29.4s | |||
| add v11.4s, v11.4s ,v29.4s | |||
| add v12.4s, v12.4s ,v29.4s | |||
| add v13.4s, v13.4s ,v29.4s | |||
| add v14.4s, v14.4s ,v29.4s | |||
| add v15.4s, v15.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smax v8.4s, v8.4s ,v30.4s | |||
| smax v9.4s, v9.4s ,v30.4s | |||
| smax v10.4s, v10.4s ,v30.4s | |||
| smax v11.4s, v11.4s ,v30.4s | |||
| smax v12.4s, v12.4s ,v30.4s | |||
| smax v13.4s, v13.4s ,v30.4s | |||
| smax v14.4s, v14.4s ,v30.4s | |||
| smax v15.4s, v15.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| smin v8.4s, v8.4s ,v31.4s | |||
| smin v9.4s, v9.4s ,v31.4s | |||
| smin v10.4s, v10.4s ,v31.4s | |||
| smin v11.4s, v11.4s ,v31.4s | |||
| smin v12.4s, v12.4s ,v31.4s | |||
| smin v13.4s, v13.4s ,v31.4s | |||
| smin v14.4s, v14.4s ,v31.4s | |||
| smin v15.4s, v15.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| sqxtn v2.4h, v2.4s | |||
| sqxtn v3.4h, v3.4s | |||
| sqxtn v4.4h, v4.4s | |||
| sqxtn v5.4h, v5.4s | |||
| sqxtn v6.4h, v6.4s | |||
| sqxtn v7.4h, v7.4s | |||
| sqxtn v8.4h, v8.4s | |||
| sqxtn v9.4h, v9.4s | |||
| sqxtn v10.4h, v10.4s | |||
| sqxtn v11.4h, v11.4s | |||
| sqxtn v12.4h, v12.4s | |||
| sqxtn v13.4h, v13.4s | |||
| sqxtn v14.4h, v14.4s | |||
| sqxtn v15.4h, v15.4s | |||
| sqxtn v0.8b, v0.8h | |||
| sqxtn v1.8b, v1.8h | |||
| sqxtn v2.8b, v2.8h | |||
| sqxtn v3.8b, v3.8h | |||
| sqxtn v4.8b, v4.8h | |||
| sqxtn v5.8b, v5.8h | |||
| sqxtn v6.8b, v6.8h | |||
| sqxtn v7.8b, v7.8h | |||
| sqxtn v8.8b, v8.8h | |||
| sqxtn v9.8b, v9.8h | |||
| sqxtn v10.8b, v10.8h | |||
| sqxtn v11.8b, v11.8h | |||
| sqxtn v12.8b, v12.8h | |||
| sqxtn v13.8b, v13.8h | |||
| sqxtn v14.8b, v14.8h | |||
| sqxtn v15.8b, v15.8h | |||
| add x17, x3, #1 | |||
| add x18, x3, #2 | |||
| add x21, x3, #3 | |||
| st1 {v0.b}[0], [x3], x9 | |||
| st1 {v0.b}[1], [x17], x9 | |||
| st1 {v0.b}[2], [x18], x9 | |||
| st1 {v0.b}[3], [x21], x9 | |||
| st1 {v1.b}[0], [x3], x9 | |||
| st1 {v1.b}[1], [x17], x9 | |||
| st1 {v1.b}[2], [x18], x9 | |||
| st1 {v1.b}[3], [x21], x9 | |||
| st1 {v2.b}[0], [x3], x9 | |||
| st1 {v2.b}[1], [x17], x9 | |||
| st1 {v2.b}[2], [x18], x9 | |||
| st1 {v2.b}[3], [x21], x9 | |||
| st1 {v3.b}[0], [x3], x9 | |||
| st1 {v3.b}[1], [x17], x9 | |||
| st1 {v3.b}[2], [x18], x9 | |||
| st1 {v3.b}[3], [x21], x9 | |||
| st1 {v4.b}[0], [x3], x9 | |||
| st1 {v4.b}[1], [x17], x9 | |||
| st1 {v4.b}[2], [x18], x9 | |||
| st1 {v4.b}[3], [x21], x9 | |||
| st1 {v5.b}[0], [x3], x9 | |||
| st1 {v5.b}[1], [x17], x9 | |||
| st1 {v5.b}[2], [x18], x9 | |||
| st1 {v5.b}[3], [x21], x9 | |||
| st1 {v6.b}[0], [x3], x9 | |||
| st1 {v6.b}[1], [x17], x9 | |||
| st1 {v6.b}[2], [x18], x9 | |||
| st1 {v6.b}[3], [x21], x9 | |||
| st1 {v7.b}[0], [x3], x9 | |||
| st1 {v7.b}[1], [x17], x9 | |||
| st1 {v7.b}[2], [x18], x9 | |||
| st1 {v7.b}[3], [x21], x9 | |||
| st1 {v8.b}[0], [x3], x9 | |||
| st1 {v8.b}[1], [x17], x9 | |||
| st1 {v8.b}[2], [x18], x9 | |||
| st1 {v8.b}[3], [x21], x9 | |||
| st1 {v9.b}[0], [x3], x9 | |||
| st1 {v9.b}[1], [x17], x9 | |||
| st1 {v9.b}[2], [x18], x9 | |||
| st1 {v9.b}[3], [x21], x9 | |||
| st1 {v10.b}[0], [x3], x9 | |||
| st1 {v10.b}[1], [x17], x9 | |||
| st1 {v10.b}[2], [x18], x9 | |||
| st1 {v10.b}[3], [x21], x9 | |||
| st1 {v11.b}[0], [x3], x9 | |||
| st1 {v11.b}[1], [x17], x9 | |||
| st1 {v11.b}[2], [x18], x9 | |||
| st1 {v11.b}[3], [x21], x9 | |||
| st1 {v12.b}[0], [x3], x9 | |||
| st1 {v12.b}[1], [x17], x9 | |||
| st1 {v12.b}[2], [x18], x9 | |||
| st1 {v12.b}[3], [x21], x9 | |||
| st1 {v13.b}[0], [x3], x9 | |||
| st1 {v13.b}[1], [x17], x9 | |||
| st1 {v13.b}[2], [x18], x9 | |||
| st1 {v13.b}[3], [x21], x9 | |||
| st1 {v14.b}[0], [x3], x9 | |||
| st1 {v14.b}[1], [x17], x9 | |||
| st1 {v14.b}[2], [x18], x9 | |||
| st1 {v14.b}[3], [x21], x9 | |||
| st1 {v15.b}[0], [x3], x9 | |||
| st1 {v15.b}[1], [x17], x9 | |||
| st1 {v15.b}[2], [x18], x9 | |||
| st1 {v15.b}[3], [x21], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #16 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| blt LoopW | |||
| cmp x24, #16 | |||
| bge LoopW16 | |||
| LoopW8: | |||
| mov x19, #8 | |||
| mul x19, x19, x11 | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| mov v1.16b, v24.16b | |||
| mov v2.16b, v24.16b | |||
| mov v3.16b, v24.16b | |||
| mov v4.16b, v24.16b | |||
| mov v5.16b, v24.16b | |||
| mov v6.16b, v24.16b | |||
| mov v7.16b, v24.16b | |||
| LoopKh8: | |||
| mov x18, x7 | |||
| mov x21, x16 | |||
| LoopKw8: | |||
| mov x22, x21 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v17.4h}, [x22], x13 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| smlal v1.4s, v17.4h, v25.4h | |||
| ld1 {v18.4h}, [x22], x13 | |||
| ld1 {v19.4h}, [x22], x13 | |||
| smlal v2.4s, v18.4h, v25.4h | |||
| smlal v3.4s, v19.4h, v25.4h | |||
| ld1 {v20.4h}, [x22], x13 | |||
| ld1 {v21.4h}, [x22], x13 | |||
| smlal v4.4s, v20.4h, v25.4h | |||
| smlal v5.4s, v21.4h, v25.4h | |||
| ld1 {v22.4h}, [x22], x13 | |||
| ld1 {v23.4h}, [x22], x13 | |||
| smlal v6.4s, v22.4h, v25.4h | |||
| smlal v7.4s, v23.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| add x21, x21, x13 | |||
| bne LoopKw8 | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh8 | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqshl v1.4s, v1.4s ,v26.4s | |||
| sqshl v2.4s, v2.4s ,v26.4s | |||
| sqshl v3.4s, v3.4s ,v26.4s | |||
| sqshl v4.4s, v4.4s ,v26.4s | |||
| sqshl v5.4s, v5.4s ,v26.4s | |||
| sqshl v6.4s, v6.4s ,v26.4s | |||
| sqshl v7.4s, v7.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrdmulh v1.4s, v1.4s ,v27.4s | |||
| sqrdmulh v2.4s, v2.4s ,v27.4s | |||
| sqrdmulh v3.4s, v3.4s ,v27.4s | |||
| sqrdmulh v4.4s, v4.4s ,v27.4s | |||
| sqrdmulh v5.4s, v5.4s ,v27.4s | |||
| sqrdmulh v6.4s, v6.4s ,v27.4s | |||
| sqrdmulh v7.4s, v7.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| sqrshl v1.4s, v1.4s ,v28.4s | |||
| sqrshl v2.4s, v2.4s ,v28.4s | |||
| sqrshl v3.4s, v3.4s ,v28.4s | |||
| sqrshl v4.4s, v4.4s ,v28.4s | |||
| sqrshl v5.4s, v5.4s ,v28.4s | |||
| sqrshl v6.4s, v6.4s ,v28.4s | |||
| sqrshl v7.4s, v7.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| add v1.4s, v1.4s ,v29.4s | |||
| add v2.4s, v2.4s ,v29.4s | |||
| add v3.4s, v3.4s ,v29.4s | |||
| add v4.4s, v4.4s ,v29.4s | |||
| add v5.4s, v5.4s ,v29.4s | |||
| add v6.4s, v6.4s ,v29.4s | |||
| add v7.4s, v7.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smax v1.4s, v1.4s ,v30.4s | |||
| smax v2.4s, v2.4s ,v30.4s | |||
| smax v3.4s, v3.4s ,v30.4s | |||
| smax v4.4s, v4.4s ,v30.4s | |||
| smax v5.4s, v5.4s ,v30.4s | |||
| smax v6.4s, v6.4s ,v30.4s | |||
| smax v7.4s, v7.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| smin v1.4s, v1.4s ,v31.4s | |||
| smin v2.4s, v2.4s ,v31.4s | |||
| smin v3.4s, v3.4s ,v31.4s | |||
| smin v4.4s, v4.4s ,v31.4s | |||
| smin v5.4s, v5.4s ,v31.4s | |||
| smin v6.4s, v6.4s ,v31.4s | |||
| smin v7.4s, v7.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v1.4h, v1.4s | |||
| sqxtn v2.4h, v2.4s | |||
| sqxtn v3.4h, v3.4s | |||
| sqxtn v4.4h, v4.4s | |||
| sqxtn v5.4h, v5.4s | |||
| sqxtn v6.4h, v6.4s | |||
| sqxtn v7.4h, v7.4s | |||
| sqxtn v0.8b, v0.8h | |||
| sqxtn v1.8b, v1.8h | |||
| sqxtn v2.8b, v2.8h | |||
| sqxtn v3.8b, v3.8h | |||
| sqxtn v4.8b, v4.8h | |||
| sqxtn v5.8b, v5.8h | |||
| sqxtn v6.8b, v6.8h | |||
| sqxtn v7.8b, v7.8h | |||
| add x17, x3, #1 | |||
| add x18, x3, #2 | |||
| add x21, x3, #3 | |||
| st1 {v0.b}[0], [x3], x9 | |||
| st1 {v0.b}[1], [x17], x9 | |||
| st1 {v0.b}[2], [x18], x9 | |||
| st1 {v0.b}[3], [x21], x9 | |||
| st1 {v1.b}[0], [x3], x9 | |||
| st1 {v1.b}[1], [x17], x9 | |||
| st1 {v1.b}[2], [x18], x9 | |||
| st1 {v1.b}[3], [x21], x9 | |||
| st1 {v2.b}[0], [x3], x9 | |||
| st1 {v2.b}[1], [x17], x9 | |||
| st1 {v2.b}[2], [x18], x9 | |||
| st1 {v2.b}[3], [x21], x9 | |||
| st1 {v3.b}[0], [x3], x9 | |||
| st1 {v3.b}[1], [x17], x9 | |||
| st1 {v3.b}[2], [x18], x9 | |||
| st1 {v3.b}[3], [x21], x9 | |||
| st1 {v4.b}[0], [x3], x9 | |||
| st1 {v4.b}[1], [x17], x9 | |||
| st1 {v4.b}[2], [x18], x9 | |||
| st1 {v4.b}[3], [x21], x9 | |||
| st1 {v5.b}[0], [x3], x9 | |||
| st1 {v5.b}[1], [x17], x9 | |||
| st1 {v5.b}[2], [x18], x9 | |||
| st1 {v5.b}[3], [x21], x9 | |||
| st1 {v6.b}[0], [x3], x9 | |||
| st1 {v6.b}[1], [x17], x9 | |||
| st1 {v6.b}[2], [x18], x9 | |||
| st1 {v6.b}[3], [x21], x9 | |||
| st1 {v7.b}[0], [x3], x9 | |||
| st1 {v7.b}[1], [x17], x9 | |||
| st1 {v7.b}[2], [x18], x9 | |||
| st1 {v7.b}[3], [x21], x9 | |||
| add x23, x23, x19 | |||
| sub x24, x24, #8 | |||
| cmp x24, #0 | |||
| ble LoopWEnd | |||
| cmp x24, #8 | |||
| bge LoopW8 | |||
| LoopW: | |||
| mov x16, x23 | |||
| mov x17, x2 | |||
| mov x20, x6 | |||
| mov v0.16b, v24.16b | |||
| LoopKh: | |||
| mov x18, x7 | |||
| mov x22, x16 | |||
| LoopKw: | |||
| ld1 {v16.4h}, [x22], x13 | |||
| ld1 {v25.4h}, [x17], #8 | |||
| smlal v0.4s, v16.4h, v25.4h | |||
| subs x18, x18, #1 | |||
| bne LoopKw | |||
| add x16, x16, x12 | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| sqshl v0.4s, v0.4s ,v26.4s | |||
| sqrdmulh v0.4s, v0.4s ,v27.4s | |||
| sqrshl v0.4s, v0.4s ,v28.4s | |||
| add v0.4s, v0.4s ,v29.4s | |||
| smax v0.4s, v0.4s ,v30.4s | |||
| smin v0.4s, v0.4s ,v31.4s | |||
| sqxtn v0.4h, v0.4s | |||
| sqxtn v0.8b, v0.8h | |||
| mov x17, x3 | |||
| st1 {v0.b}[0], [x17], #1 | |||
| st1 {v0.b}[1], [x17], #1 | |||
| st1 {v0.b}[2], [x17], #1 | |||
| st1 {v0.b}[3], [x17], #1 | |||
| add x3, x3, x9 | |||
| add x23, x23, x11 | |||
| subs x24, x24, #1 | |||
| bne LoopW | |||
| LoopWEnd: | |||
| add x0, x0, x8 | |||
| add x1, x1, x10 | |||
| subs x4, x4, #1 | |||
| bne LoopH | |||
| sub sp, sp, #48 | |||
| ldp x19, x20, [sp], #16 | |||
| ldp x21, x22, [sp], #16 | |||
| ldp x23, x24, [sp], #16 | |||
| ret | |||
| #endif | |||
| @@ -35,12 +35,12 @@ DeconvDwFp32Center: | |||
| mov x18, x15 | |||
| mov x19, x2 | |||
| mov x20, x5 | |||
| dup v0.4s, wzr | |||
| ld1 {v1.4s}, [x16], x8 | |||
| LoopKh: | |||
| mov x21, x18 | |||
| mov x13, x6 | |||
| LoopKw: | |||
| ld1 {v1.4s}, [x16] | |||
| ld1 {v0.4s}, [x21] | |||
| ld1 {v2.4s}, [x19], #16 | |||
| fmla v0.4s, v1.4s, v2.4s | |||
| st1 {v0.4s}, [x21], x12 | |||
| @@ -50,7 +50,6 @@ DeconvDwFp32Center: | |||
| subs x20, x20, #1 | |||
| bne LoopKh | |||
| add x15, x15, x10 | |||
| add x16, x16, x8 | |||
| subs x17, x17, #1 | |||
| bne LoopW | |||
| add x0, x0, x9 | |||
| @@ -0,0 +1,65 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global DeconvDwInt8Center
#ifndef __APPLE__
    .type DeconvDwInt8Center, %function
#endif

// void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Register arguments: x0: dst, x1: src, x2: weight, x3: height, x4: width,
//                     x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     block_channel, in_sh_step, in_sw_step, in_kh_step, in_kw_step
DeconvDwInt8Center:
    // AAPCS64: x19-x22 are callee-saved, so they are spilled to a frame kept
    // ABOVE sp for the whole function. (The previous prologue post-indexed sp
    // back to its entry value, leaving the saved registers BELOW sp where a
    // signal handler may legally clobber them -- AArch64 has no red zone.)
    // v8-v15 are not touched, so no FP spills are needed.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    // Incoming stack arguments now sit above the 32-byte frame.
    ldr x8, [sp, #32]     // block_channel (bytes)
    ldr x9, [sp, #40]     // in_sh_step (bytes)
    ldr x10, [sp, #48]    // in_sw_step (bytes)
    ldr x11, [sp, #56]    // in_kh_step (bytes)
    ldr x12, [sp, #64]    // in_kw_step (bytes)
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets; using it as a scratch pointer is only safe on Linux/
    // Android -- confirm the supported target set.
LoopH:
    mov x15, x0           // dst_w: output row cursor
    mov x16, x1           // src_w: input row cursor
    mov x17, x4           // width counter
LoopW:
    mov x18, x15          // dst_kh
    mov x19, x2           // weight_kh: weights restart every output pixel
    mov x20, x5           // kernel_h counter
    // Load 4 int16 input channels once per output pixel and advance src by
    // block_channel via the post-index writeback. (Previously a redundant
    // "add x16, x16, x8" at the bottom of LoopW advanced src a SECOND time
    // per iteration, skipping every other pixel -- the fp32 and fp16 twins
    // of this kernel advance src exactly once.)
    ld1 {v1.4h}, [x16], x8
LoopKh:
    mov x21, x18          // dst_kw
    mov x13, x6           // kernel_w counter
LoopKw:
    ld1 {v0.4s}, [x21]            // 4 int32 accumulators (read-modify-write)
    ld1 {v2.4h}, [x19], #8        // 4 int16 weights
    smlal v0.4s, v1.4h, v2.4h     // widening multiply-accumulate into int32
    st1 {v0.4s}, [x21], x12       // scatter back, stride in_kw_step
    subs x13, x13, #1
    bne LoopKw
    add x18, x18, x11             // next kernel row: dst += in_kh_step
    subs x20, x20, #1
    bne LoopKh
    add x15, x15, x10             // next output column: dst += in_sw_step
    subs x17, x17, #1
    bne LoopW
    add x0, x0, x9                // next output row: dst += in_sh_step
    add x1, x1, x7                // next input row: src += out_h_step
    subs x3, x3, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
| @@ -0,0 +1,294 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwFp16Center
#ifndef __APPLE__
    .type ConvDwFp16Center, %function
#endif

// void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
//                       size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
//                       size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step,
//                       size_t in_kw_step, size_t relu, size_t relu6);
// Register arguments: x0: dst, x1: src, x2: weight, x3: bias, x4: height, x5: width,
//                     x6: kernel_h, x7: kernel_w
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     out_h_step, block_channel, in_sh_step, in_sw_step, in_kh_step,
//                     in_kw_step, relu, relu6
ConvDwFp16Center:
    // AAPCS64: x19-x28 and the low 64 bits of v8-v15 are callee-saved. This
    // kernel clobbers x19-x24 AND v8-v15 (the 16-wide accumulator bank below),
    // so both sets are spilled to a proper frame kept ABOVE sp. (The previous
    // version post-indexed sp back to its entry value -- leaving the saves
    // below sp, which a signal handler may clobber -- and never preserved
    // d8-d15 at all, silently destroying the caller's FP state.)
    sub sp, sp, #112
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]
    // Incoming stack arguments now sit above the 112-byte frame.
    ldr x8, [sp, #112]    // out_h_step (bytes)
    ldr x9, [sp, #120]    // block_channel (bytes)
    ldr x10, [sp, #128]   // in_sh_step (bytes)
    ldr x11, [sp, #136]   // in_sw_step (bytes)
    ldr x12, [sp, #144]   // in_kh_step (bytes)
    ldr x13, [sp, #152]   // in_kw_step (bytes)
    ldr x14, [sp, #160]   // relu flag
    ldr x15, [sp, #168]   // relu6 flag
    ld1 {v24.8h}, [x3]            // bias: seeds every accumulator
    movi v26.8h, #0x46, lsl #8    // fp16 6.0 (0x4600) for the relu6 upper clamp
    dup v27.4s, wzr               // all-zero vector for the relu lower clamp
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets -- confirm the supported target set before using it as
    // a scratch counter.
LoopH:
    mov x23, x1           // src row cursor
    mov x24, x5           // remaining output width
    mov x3, x0            // dst row cursor (x3 = bias ptr no longer needed)
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    blt LoopW8
// 16 output pixels per iteration; accumulators v0-v15.
LoopW16:
    mov x19, #16
    mul x19, x19, x11     // src advance per 16-wide tile = 16 * in_sw_step
    mov x16, x23          // src_kh
    mov x17, x2           // weight cursor
    mov x20, x6           // kernel_h counter
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
    mov v8.16b, v24.16b
    mov v9.16b, v24.16b
    mov v10.16b, v24.16b
    mov v11.16b, v24.16b
    mov v12.16b, v24.16b
    mov v13.16b, v24.16b
    mov v14.16b, v24.16b
    mov v15.16b, v24.16b
LoopKh16:
    mov x18, x7           // kernel_w counter
    mov x21, x16          // src_kw
LoopKw16:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16      // one weight vector shared by all 16 pixels
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v8.8h, v16.8h, v25.8h
    fmla v9.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v10.8h, v18.8h, v25.8h
    fmla v11.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v12.8h, v20.8h, v25.8h
    fmla v13.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v14.8h, v22.8h, v25.8h
    fmla v15.8h, v23.8h, v25.8h
    subs x18, x18, #1
    add x21, x21, x13             // next kernel column: src += in_kw_step
    bne LoopKw16
    add x16, x16, x12             // next kernel row: src += in_kh_step
    subs x20, x20, #1
    bne LoopKh16
    cbnz x15, Relu616             // relu6 implies both clamps (falls through to Relu16)
    cbnz x14, Relu16
    b Write16
Relu616:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
    fmin v8.8h, v8.8h, v26.8h
    fmin v9.8h, v9.8h, v26.8h
    fmin v10.8h, v10.8h, v26.8h
    fmin v11.8h, v11.8h, v26.8h
    fmin v12.8h, v12.8h, v26.8h
    fmin v13.8h, v13.8h, v26.8h
    fmin v14.8h, v14.8h, v26.8h
    fmin v15.8h, v15.8h, v26.8h
Relu16:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
    fmax v8.8h, v8.8h, v27.8h
    fmax v9.8h, v9.8h, v27.8h
    fmax v10.8h, v10.8h, v27.8h
    fmax v11.8h, v11.8h, v27.8h
    fmax v12.8h, v12.8h, v27.8h
    fmax v13.8h, v13.8h, v27.8h
    fmax v14.8h, v14.8h, v27.8h
    fmax v15.8h, v15.8h, v27.8h
Write16:
    st1 {v0.8h}, [x3], x9         // dst advances by block_channel per pixel
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    st1 {v8.8h}, [x3], x9
    st1 {v9.8h}, [x3], x9
    st1 {v10.8h}, [x3], x9
    st1 {v11.8h}, [x3], x9
    st1 {v12.8h}, [x3], x9
    st1 {v13.8h}, [x3], x9
    st1 {v14.8h}, [x3], x9
    st1 {v15.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #16
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    blt LoopW
    cmp x24, #16
    bge LoopW16
// 8 output pixels per iteration; accumulators v0-v7 only.
LoopW8:
    mov x19, #8
    mul x19, x19, x11     // src advance per 8-wide tile = 8 * in_sw_step
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v3.16b, v24.16b
    mov v4.16b, v24.16b
    mov v5.16b, v24.16b
    mov v6.16b, v24.16b
    mov v7.16b, v24.16b
LoopKh8:
    mov x18, x7
    mov x21, x16
LoopKw8:
    mov x22, x21
    ld1 {v25.8h}, [x17], #16
    ld1 {v16.8h}, [x22], x11
    ld1 {v17.8h}, [x22], x11
    fmla v0.8h, v16.8h, v25.8h
    fmla v1.8h, v17.8h, v25.8h
    ld1 {v18.8h}, [x22], x11
    ld1 {v19.8h}, [x22], x11
    fmla v2.8h, v18.8h, v25.8h
    fmla v3.8h, v19.8h, v25.8h
    ld1 {v20.8h}, [x22], x11
    ld1 {v21.8h}, [x22], x11
    fmla v4.8h, v20.8h, v25.8h
    fmla v5.8h, v21.8h, v25.8h
    ld1 {v22.8h}, [x22], x11
    ld1 {v23.8h}, [x22], x11
    fmla v6.8h, v22.8h, v25.8h
    fmla v7.8h, v23.8h, v25.8h
    subs x18, x18, #1
    add x21, x21, x13
    bne LoopKw8
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh8
    cbnz x15, Relu68
    cbnz x14, Relu8
    b Write8
Relu68:
    fmin v0.8h, v0.8h, v26.8h
    fmin v1.8h, v1.8h, v26.8h
    fmin v2.8h, v2.8h, v26.8h
    fmin v3.8h, v3.8h, v26.8h
    fmin v4.8h, v4.8h, v26.8h
    fmin v5.8h, v5.8h, v26.8h
    fmin v6.8h, v6.8h, v26.8h
    fmin v7.8h, v7.8h, v26.8h
Relu8:
    fmax v0.8h, v0.8h, v27.8h
    fmax v1.8h, v1.8h, v27.8h
    fmax v2.8h, v2.8h, v27.8h
    fmax v3.8h, v3.8h, v27.8h
    fmax v4.8h, v4.8h, v27.8h
    fmax v5.8h, v5.8h, v27.8h
    fmax v6.8h, v6.8h, v27.8h
    fmax v7.8h, v7.8h, v27.8h
Write8:
    st1 {v0.8h}, [x3], x9
    st1 {v1.8h}, [x3], x9
    st1 {v2.8h}, [x3], x9
    st1 {v3.8h}, [x3], x9
    st1 {v4.8h}, [x3], x9
    st1 {v5.8h}, [x3], x9
    st1 {v6.8h}, [x3], x9
    st1 {v7.8h}, [x3], x9
    add x23, x23, x19
    sub x24, x24, #8
    cmp x24, #0
    ble LoopWEnd
    cmp x24, #8
    bge LoopW8
// Scalar tail: one output pixel per iteration.
LoopW:
    mov x16, x23
    mov x17, x2
    mov x20, x6
    mov v0.16b, v24.16b
LoopKh:
    mov x18, x7
    mov x22, x16
LoopKw:
    ld1 {v16.8h}, [x22], x13
    ld1 {v25.8h}, [x17], #16
    fmla v0.8h, v16.8h, v25.8h
    subs x18, x18, #1
    bne LoopKw
    add x16, x16, x12
    subs x20, x20, #1
    bne LoopKh
    cbnz x15, Relu6
    cbnz x14, Relu
    b Write
Relu6:
    fmin v0.8h, v0.8h, v26.8h
Relu:
    fmax v0.8h, v0.8h, v27.8h
Write:
    st1 {v0.8h}, [x3], x9
    add x23, x23, x11
    subs x24, x24, #1
    bne LoopW
LoopWEnd:
    add x0, x0, x8        // next output row: dst += out_h_step
    add x1, x1, x10       // next input row: src += in_sh_step
    subs x4, x4, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    add sp, sp, #112
    ret
#endif
| @@ -0,0 +1,64 @@ | |||
#ifdef __aarch64__
    .text
    .align 5
    .global DeconvDwFp16Center
#ifndef __APPLE__
    .type DeconvDwFp16Center, %function
#endif

// void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
//                         size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
//                         size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
// Register arguments: x0: dst, x1: src, x2: weight, x3: height, x4: width,
//                     x5: kernel_h, x6: kernel_w, x7: out_h_step
// Stack arguments (all step values are byte offsets, used directly as address increments):
//                     block_channel, in_sh_step, in_sw_step, in_kh_step, in_kw_step
DeconvDwFp16Center:
    // AAPCS64: x19-x22 are callee-saved, so they are spilled to a frame kept
    // ABOVE sp for the whole function. (The previous prologue post-indexed sp
    // back to its entry value, leaving the saved registers BELOW sp where a
    // signal handler may legally clobber them -- AArch64 has no red zone.)
    // v8-v15 are not touched, so no FP spills are needed.
    sub sp, sp, #32
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    // Incoming stack arguments now sit above the 32-byte frame.
    ldr x8, [sp, #32]     // block_channel (bytes)
    ldr x9, [sp, #40]     // in_sh_step (bytes)
    ldr x10, [sp, #48]    // in_sw_step (bytes)
    ldr x11, [sp, #56]    // in_kh_step (bytes)
    ldr x12, [sp, #64]    // in_kw_step (bytes)
    // NOTE(review): x18 is the reserved platform register on Apple/Windows
    // AArch64 targets -- confirm the supported target set.
LoopH:
    mov x15, x0           // dst_w: output row cursor
    mov x16, x1           // src_w: input row cursor
    mov x17, x4           // width counter
LoopW:
    mov x18, x15          // dst_kh
    mov x19, x2           // weight_kh: weights restart every output pixel
    mov x20, x5           // kernel_h counter
    // Load 8 fp16 input channels once per output pixel; the post-index
    // writeback advances src by block_channel.
    ld1 {v1.8h}, [x16], x8
LoopKh:
    mov x21, x18          // dst_kw
    mov x13, x6           // kernel_w counter
LoopKw:
    ld1 {v0.8h}, [x21]            // read-modify-write scatter accumulate
    ld1 {v2.8h}, [x19], #16       // 8 fp16 weights
    fmla v0.8h, v1.8h, v2.8h
    st1 {v0.8h}, [x21], x12       // stride in_kw_step
    subs x13, x13, #1
    bne LoopKw
    add x18, x18, x11             // next kernel row: dst += in_kh_step
    subs x20, x20, #1
    bne LoopKh
    add x15, x15, x10             // next output column: dst += in_sw_step
    subs x17, x17, #1
    bne LoopW
    add x0, x0, x9                // next output row: dst += in_sh_step
    add x1, x1, x7                // next input row: src += out_h_step
    subs x3, x3, #1
    bne LoopH
    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    add sp, sp, #32
    ret
#endif
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP16_COMMON_FUNC_H_ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/op_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/conv_parameter.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, | |||
| size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, | |||
| size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_COMMON_FUNC_H_ */ | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.h" | |||
| #include <arm_neon.h> | |||
| #include "src/runtime/kernel/arm/opclib/fp16/common_func.h" | |||
| /*conv depthwise fp16 begin*/ | |||
| void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| @@ -79,6 +80,7 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, | |||
| int height, int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, | |||
| int in_sh_step, int in_sw_step, int in_kh_step, int in_kw_step, bool is_relu, bool is_relu6) { | |||
| @@ -97,12 +99,17 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| const float16_t *src_kw = src_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < kernel_w; kw++) { | |||
| #ifdef ENABLE_ARM64 | |||
| float16x8_t src_8 = vld1q_f16(src_kw); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_w); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_w, dst_8); | |||
| #else | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst_w[c] += src_kw[c] * weight_kw[c]; | |||
| } | |||
| #endif | |||
| src_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| @@ -122,6 +129,7 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t * | |||
| src_h += in_sh_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| // conv depthwise fp16: sliding window | |||
| void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data, | |||
| @@ -149,11 +157,19 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||
| const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; | |||
| float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwFp16Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), | |||
| sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t), | |||
| conv_param->is_relu_, conv_param->is_relu6_); | |||
| #else | |||
| DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_, | |||
| sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_); | |||
| #endif | |||
| } | |||
| } // output C8 loop | |||
| src += sliding->in_step_; | |||
| @@ -214,6 +230,7 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step) { | |||
| @@ -229,12 +246,17 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float | |||
| float16_t *dst_kw = dst_kh; | |||
| const float16_t *weight_kw = weight_kh; | |||
| for (int kw = 0; kw < kernel_w; kw++) { | |||
| #ifdef ENABLE_ARM64 | |||
| float16x8_t src_8 = vld1q_f16(src_w); | |||
| float16x8_t weight_8 = vld1q_f16(weight_kw); | |||
| float16x8_t dst_8 = vld1q_f16(dst_kw); | |||
| dst_8 = vfmaq_f16(dst_8, src_8, weight_8); | |||
| vst1q_f16(dst_kw, dst_8); | |||
| #else | |||
| for (int c = 0; c < C8NUM; c++) { | |||
| dst_kw[c] += src_w[c] * weight_kw[c]; | |||
| } | |||
| #endif | |||
| dst_kw += in_kw_step; | |||
| weight_kw += C8NUM; | |||
| } // kernel_w loop | |||
| @@ -248,6 +270,7 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel, | |||
| const ConvParameter *conv_param) { | |||
| @@ -289,11 +312,18 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f | |||
| float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_; | |||
| const float16_t *in_t = | |||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwFp16Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(float16_t), sliding->block_channel_ * sizeof(float16_t), | |||
| sliding->in_sh_step_ * sizeof(float16_t), sliding->in_sw_step_ * sizeof(float16_t), | |||
| sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t)); | |||
| #else | |||
| DeconvDepthwiseCenterFp16(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, | |||
| sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); | |||
| #endif | |||
| } | |||
| DeconvDepthwisePostFuncFp16(dst_data, bias, sliding->block_channel_, conv_param); | |||
| } // output C8 loop | |||
| @@ -38,6 +38,15 @@ void MatrixSub(const float *a_ptr, const float *b_ptr, float *dst, size_t a_stri | |||
| void MatrixMultiAdd(float *c11, float *c12, float *c21, float *c22, float *x_ptr, size_t row, size_t col, | |||
| size_t c_stride, size_t x_stride); | |||
| #ifdef ENABLE_ARM | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size); | |||
| @@ -49,12 +58,6 @@ void C4BiasAddRelu(float *dst, const float *input, const float* bias, size_t oc, | |||
| void C4BiasAddRelu6(float *dst, const float *input, const float* bias, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void C4Relu6(float *dst, const float *input, size_t oc, size_t plane_size, size_t stride); | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, size_t relu, size_t relu6); | |||
| void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| @@ -0,0 +1,62 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/op_base.h" | |||
| #include "src/runtime/kernel/arm/opclib/conv_parameter.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM | |||
| void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, | |||
| size_t oc4, size_t offset); | |||
| #ifdef ENABLE_ARM64 | |||
| void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #elif defined(ENABLE_ARM32) | |||
| void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #endif | |||
| #endif | |||
| #ifdef ENABLE_ARM | |||
| void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); | |||
| void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, | |||
| size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, | |||
| size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int out_multiplier, | |||
| int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif /* MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_COMMON_FUNC_H_ */ | |||
| @@ -17,6 +17,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.h" | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h" | |||
| #include "src/runtime/kernel/arm/opclib/int8/common_func.h" | |||
| /*conv depthwise int8 begin*/ | |||
| void DepthwiseBorderPixelInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| @@ -85,6 +86,7 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, int height, | |||
| int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step, int out_multiplier, int left_shift, | |||
| @@ -133,6 +135,7 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight, | |||
| src_h += in_sh_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *weight_data, const int32_t *bias_data, | |||
| const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) { | |||
| @@ -158,7 +161,17 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||
| int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_; | |||
| const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * C4NUM; | |||
| int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * C4NUM; | |||
| #ifdef ENABLE_ARM64 | |||
| ConvDwInt8Center( | |||
| out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | |||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t), | |||
| sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int16_t), | |||
| sliding->in_sw_step_ * sizeof(int16_t), sliding->in_kh_step_ * sizeof(int16_t), | |||
| sliding->in_kw_step_ * sizeof(int16_t), conv_param->conv_quant_arg_.quant_multiplier_[0], | |||
| conv_param->conv_quant_arg_.left_shift_[0], conv_param->conv_quant_arg_.right_shift_[0], | |||
| conv_param->conv_quant_arg_.quant_args_[2][0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| #else | |||
| DepthwiseCenterInt8( | |||
| out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, | |||
| conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_, | |||
| @@ -166,6 +179,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w | |||
| conv_param->conv_quant_arg_.quant_multiplier_[0], conv_param->conv_quant_arg_.left_shift_[0], | |||
| conv_param->conv_quant_arg_.right_shift_[0], conv_param->conv_quant_arg_.quant_args_[2][0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0]); | |||
| #endif | |||
| } | |||
| } // output C4 loop | |||
| src += sliding->in_step_; | |||
| @@ -222,6 +236,7 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t * | |||
| } // height loop | |||
| } | |||
| #ifndef ENABLE_ARM64 | |||
| void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t *weight, int height, int width, | |||
| int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step, | |||
| int in_sw_step, int in_kh_step, int in_kw_step) { | |||
| @@ -253,6 +268,7 @@ void DeconvDepthwiseCenterInt8(int32_t *dst, const int16_t *src, const int16_t * | |||
| src_h += out_h_step; | |||
| } // dst_height loop | |||
| } | |||
| #endif | |||
| void DeconvDepthwisePostFuncInt8(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, | |||
| const ConvParameter *conv_param, int out_multiplier, int left_shift, int right_shift, | |||
| @@ -302,11 +318,18 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in | |||
| int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * C4NUM; | |||
| const int16_t *in_t = | |||
| src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; | |||
| #ifdef ENABLE_ARM64 | |||
| DeconvDwInt8Center(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_ * sizeof(int16_t), sliding->block_channel_ * sizeof(int16_t), | |||
| sliding->in_sh_step_ * sizeof(int32_t), sliding->in_sw_step_ * sizeof(int32_t), | |||
| sliding->in_kh_step_ * sizeof(int32_t), sliding->in_kw_step_ * sizeof(int32_t)); | |||
| #else | |||
| DeconvDepthwiseCenterInt8(out_t, in_t, weight, sliding->bottom_ - sliding->top_, | |||
| sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, | |||
| sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, | |||
| sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_); | |||
| #endif | |||
| } | |||
| DeconvDepthwisePostFuncInt8( | |||
| dst_data, output_buffer, bias, sliding->block_channel_, conv_param, | |||
| @@ -17,25 +17,7 @@ | |||
| #include "src/runtime/kernel/arm/opclib/int8/conv_int8.h" | |||
| #include <string.h> | |||
| #include "src/runtime/kernel/arm/opclib/winograd_transform.h" | |||
| extern "C" { | |||
| #ifdef ENABLE_ARM | |||
| void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, | |||
| size_t oc4, size_t offset); | |||
| #ifdef ENABLE_ARM64 | |||
| void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #elif defined(ENABLE_ARM32) | |||
| void IndirectGemmInt8_2x4(int8_t *output, const int8_t *input, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t oc, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, size_t out_multiplier, size_t shift_before, | |||
| size_t shift_after); | |||
| #endif | |||
| #endif | |||
| } | |||
| #include "src/runtime/kernel/arm/opclib/int8/common_func.h" | |||
| void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||