diff --git a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S index eba7d8e605..66045743d7 100644 --- a/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S +++ b/mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S @@ -7,219 +7,273 @@ #ifndef __APPLE__ .type ConvDwInt8Center, %function #endif - -// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width, -// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step, -// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift, -// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); -// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w, -// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step,#84: in_kw_step -// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max +// void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, +// size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, +// size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp, int32_t *out_zp, +// int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t *acc_min, +// int32_t *acc_max); offsets below are relative to sp after the prologue's "add sp, sp, #208" +// #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w, +// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step +// #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60: acc_min, #64: acc_max; NOTE(review): negative offsets read data left below sp - confirm nothing asynchronous (e.g. a signal handler) can overwrite it ConvDwInt8Center: - // at return, clang generates "push {lr}, pop {pc}"" while gcc will generate "bx lr" - // according to https://stackoverflow.com/questions/53625807 - // even if we jump to link 
register instead of saving it, we still have to save it in subroutine calls anyway - // clang's rule seems more simple, though there are no subroutine calls here - // r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf +// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr" +// according to https://stackoverflow.com/questions/53625807 +// even if we jump to link register instead of saving it, we still have to save it in subroutine calls anyway +// clang's rule seems simpler, though there are no subroutine calls here +// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf; NOTE(review): r9 is overwritten in LoopKH/LoopKH1 but is not in the push list below - confirm it need not be preserved on every target ABI push {r0-r8, r10, r11, lr} vpush {q4-q7} - add sp, sp, #112 - ldr r4, [sp, #48] + ldr lr, [sp, #168] + vld1.32 {q0, q1}, [lr] + vpush {q0, q1} + ldr lr, [sp, #204] + vld1.32 {q0, q1}, [lr] + vpush {q0, q1} + ldr lr, [sp, #240] + vld1.32 {q0, q1}, [lr] + vpush {q0, q1} + add sp, sp, #208 - ldr r12, [sp, #92] - vdup.32 q9, r12 + ldr r1, [sp, #-36] + vld1.32 {q8, q9}, [r1] + ldr r1, [sp, #44] + vld1.32 {q10, q11}, [r1] + ldr r1, [sp, #48] + vld1.32 {q12, q13}, [r1] + ldr r1, [sp, #52] + vld1.32 {q14, q15}, [r1] - ldr r11, [sp, #88] - vdup.32 q10, r11 + ldr r11, [sp, #28] + ldr r4, [sp] + LoopH: + ldr r1, [sp, #-44] + ldr r0, [sp, #-48] + ldr r5, [sp, #4] + LoopW2: + vmov q4, q8 + vmov q5, q9 + vmov q6, q8 + vmov q7, q9 + mov r7, r1 + ldr r3, [sp, #-40] + ldr r6, [sp, #8] + LoopKH: + mov r9, r7 + ldr r10, [sp, #12] + LoopKW: + mov r8, r9 + vld1.16 {q0}, [r3]! 
+ ldr lr, [sp, #40] + vld1.8 {d2}, [lr] - ldr r10, [sp, #96] - vdup.32 q11, r10 + vld1.8 {d3}, [r8] + add r8, r8, r11 + vsubl.s8 q2, d3, d2 + vmlal.s16 q4, d4, d0 + vmlal.s16 q5, d5, d1 - ldr r8, [sp, #100] - vdup.32 q12, r8 - - ldr r7, [sp, #104] - vdup.32 q13, r7 + vld1.8 {d3}, [r8] + add r8, r8, r11 + vsubl.s8 q2, d3, d2 + vmlal.s16 q6, d4, d0 + vmlal.s16 q7, d5, d1 - ldr r6, [sp, #108] - vdup.32 q14, r6 + ldr r12, [sp, #36] + add r9, r9, r12 + subs r10, r10, #1 + bne LoopKW + ldr r12, [sp, #32] + add r7, r7, r12 + subs r6, r6, #1 + bne LoopKH - vld1.32 {q15}, [r3] + vshl.s32 q4, q4, q14 + vshl.s32 q5, q5, q15 + vshl.s32 q6, q6, q14 + vshl.s32 q7, q7, q15 - LoopH: - ldr r1, [sp, #4] // src_w - ldr r5, [sp, #52] // width - ldr r0, [sp] // dst_w - LoopW4: - ldr r11, [sp, #76] // in_sw_step - mov r8, r1 // src_kh - ldr r2, [sp, #8] // weight_kh - ldr r6, [sp, #56] // kernel_h - vmov q0, q15 - LoopKh4: - ldr r12, [sp, #80] //in_kh_step - ldr r7, [sp, #60] // kernel_w - mov r10, r8 // src_kw - LoopKw4: - vld1.16 {d24}, [r2]! 
- vld1.16 {d8}, [r10] - add r10, r10, r11 - vmlal.s16 q0, d8, d24 - vld1.16 {d10}, [r10] - add r10, r10, r11 - vmlal.s16 q1, d10, d24 - vld1.16 {d12}, [r10] - add r10, r10, r11 - vmlal.s16 q2, d12, d24 - vld1.16 {d14}, [r10] - add r10, r10, r11 - vmlal.s16 q3, d14, d24 - subs r7, r7, #1 - bne LoopKw4 - ldr r12, [sp, #80] - add r8, r8, r12 - subs r6, r6, #1 - bne LoopKh4 - - vshl.s32 q0, q0, q9 - vshl.s32 q1, q1, q9 - vshl.s32 q2, q2, q9 - vshl.s32 q3, q3, q9 - vqrdmulh.s32 q0, q0, q10 - vqrdmulh.s32 q1, q1, q10 - vqrdmulh.s32 q2, q2, q10 - vqrdmulh.s32 q3, q3, q10 - vand q4, q0, q11 - vshr.s32 q4, q4, #31 - vqadd.s32 q0, q0, q4 - vrshl.s32 q0, q0, q11 - vand q5, q1, q11 - vshr.s32 q5, q5, #31 - vqadd.s32 q1, q1, q5 - vrshl.s32 q1, q1, q11 - vand q6, q2, q11 - vshr.s32 q6, q6, #31 - vqadd.s32 q2, q2, q6 - vrshl.s32 q2, q2, q11 - vand q7, q3, q11 - vshr.s32 q7, q7, #31 - vqadd.s32 q3, q3, q7 - vrshl.s32 q3, q3, q11 - vadd.i32 q0, q0, q12 - vadd.i32 q1, q1, q12 - vadd.i32 q2, q2, q12 - vadd.i32 q3, q3, q12 - vmax.s32 q0, q0, q13 - vmax.s32 q1, q1, q13 - vmax.s32 q2, q2, q13 - vmax.s32 q3, q3, q13 - vmin.s32 q0, q0, q14 - vmin.s32 q1, q1, q14 - vmin.s32 q2, q2, q14 - vmin.s32 q3, q3, q14 - - vqmovn.s32 d0, q0 - vqmovn.s32 d2, q1 - vqmovn.s32 d4, q2 - vqmovn.s32 d6, q3 - vqmovn.s16 d0, q0 - vqmovn.s16 d2, q1 - vqmovn.s16 d4, q2 - vqmovn.s16 d6, q3 - - mov r3, r0 - ldr r12, [sp, #68] - vst1.8 {d0[0]}, [r3]! - vst1.8 {d0[1]}, [r3]! - vst1.8 {d0[2]}, [r3]! - vst1.8 {d0[3]}, [r3]! - add r0, r0, r12 - mov r3, r0 - vst1.8 {d2[0]}, [r3]! - vst1.8 {d2[1]}, [r3]! - vst1.8 {d2[2]}, [r3]! - vst1.8 {d2[3]}, [r3]! - add r0, r0, r12 - mov r3, r0 - vst1.8 {d4[0]}, [r3]! - vst1.8 {d4[1]}, [r3]! - vst1.8 {d4[2]}, [r3]! - vst1.8 {d4[3]}, [r3]! - add r0, r0, r12 - mov r3, r0 - vst1.8 {d6[0]}, [r3]! - vst1.8 {d6[1]}, [r3]! - vst1.8 {d6[2]}, [r3]! - vst1.8 {d6[3]}, [r3]! 
- add r0, r0, r12 - mov r3, r0 - mov r12, #4 - mul r11, r11, r12 + vqrdmulh.s32 q4, q4, q12 + vqrdmulh.s32 q5, q5, q13 + vqrdmulh.s32 q6, q6, q12 + vqrdmulh.s32 q7, q7, q13 + + sub lr, sp, #144 + vld1.32 {q0, q1}, [lr] + + vand q2, q4, q0 + vshr.s32 q2, q2, #31 + vqadd.s32 q4, q4, q2 + vrshl.s32 q4, q4, q0 + + vand q2, q5, q1 + vshr.s32 q2, q2, #31 + vqadd.s32 q5, q5, q2 + vrshl.s32 q5, q5, q1 + + vand q2, q6, q0 + vshr.s32 q2, q2, #31 + vqadd.s32 q6, q6, q2 + vrshl.s32 q6, q6, q0 + + vand q2, q7, q1 + vshr.s32 q2, q2, #31 + vqadd.s32 q7, q7, q2 + vrshl.s32 q7, q7, q1 + + vadd.i32 q4, q4, q10 + vadd.i32 q5, q5, q11 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + + sub lr, sp, #176 + vld1.32 {q0, q1}, [lr] + vmax.s32 q4, q4, q0 + vmax.s32 q5, q5, q1 + vmax.s32 q6, q6, q0 + vmax.s32 q7, q7, q1 + + sub lr, sp, #208 + vld1.32 {q0, q1}, [lr] + vmin.s32 q4, q4, q0 + vmin.s32 q5, q5, q1 + vmin.s32 q6, q6, q0 + vmin.s32 q7, q7, q1 + + vqmovn.s32 d0, q4 + vqmovn.s32 d1, q5 + vqmovn.s32 d2, q6 + vqmovn.s32 d3, q7 + vqmovn.s16 d0, q0 + vqmovn.s16 d1, q1 + + + ldr r12, [sp, #20] + mov r8, r0 + vst1.8 {d0[0]}, [r8]! + vst1.8 {d0[1]}, [r8]! + vst1.8 {d0[2]}, [r8]! + vst1.8 {d0[3]}, [r8]! + vst1.8 {d0[4]}, [r8]! + vst1.8 {d0[5]}, [r8]! + vst1.8 {d0[6]}, [r8]! + vst1.8 {d0[7]}, [r8]! + add r0, r0, r12 + + mov r8, r0 + vst1.8 {d1[0]}, [r8]! + vst1.8 {d1[1]}, [r8]! + vst1.8 {d1[2]}, [r8]! + vst1.8 {d1[3]}, [r8]! + vst1.8 {d1[4]}, [r8]! + vst1.8 {d1[5]}, [r8]! + vst1.8 {d1[6]}, [r8]! + vst1.8 {d1[7]}, [r8]! + add r0, r0, r12 + + add r1, r1, r11 add r1, r1, r11 - sub r5, r5, #4 - cmp r5, #0 - ble LoopWEnd - cmp r5, #4 - bge LoopW4 + subs r5, r5, #2 + beq LoopEndW + cmp r5, #2 + bge LoopW2 + LoopW: - mov r8, r1 // src_kh - ldr r2, [sp, #8] // weight_kh - ldr r6, [sp, #56] // kernel_h - vmov q0, q15 - LoopKh: - ldr r12, [sp, #84] //in_kw_step - ldr r7, [sp, #60] // kernel_w - mov r10, r8 // src_kw - LoopKw: - vld1.16 {d2}, [r10] - add r10, r10, r12 - vld1.16 {d24}, [r2]! 
- vmlal.s16 q0, d2, d24 - subs r7, r7, #1 - bne LoopKw - ldr r12, [sp, #80] - add r8, r8, r12 + vmov q4, q8 + vmov q5, q9 + mov r7, r1 + ldr r3, [sp, #-40] + ldr r6, [sp, #8] + LoopKH1: + mov r9, r7 + ldr r10, [sp, #12] + LoopKW1: + vld1.16 {q0}, [r3]! + ldr lr, [sp, #40] + vld1.8 {d2}, [lr] + + vld1.8 {d3}, [r9] + vsubl.s8 q2, d3, d2 + vmlal.s16 q4, d4, d0 + vmlal.s16 q5, d5, d1 + + ldr r12, [sp, #36] + add r9, r9, r12 + subs r10, r10, #1 + bne LoopKW1 + ldr r12, [sp, #32] + add r7, r7, r12 subs r6, r6, #1 - bne LoopKh - - vshl.s32 q0, q0, q9 - vqrdmulh.s32 q0, q0, q10 - vand q4, q0, q11 - vshr.s32 q4, q4, #31 - vqadd.s32 q0, q0, q4 - vrshl.s32 q0, q0, q11 - vadd.i32 q0, q0, q12 - vmax.s32 q0, q0, q13 - vmin.s32 q0, q0, q14 - - vqmovn.s32 d0, q0 - vqmovn.s16 d0, q0 - - mov r3, r0 - ldr r12, [sp, #68] - vst1.8 {d0[0]}, [r3]! - vst1.8 {d0[1]}, [r3]! - vst1.8 {d0[2]}, [r3]! - vst1.8 {d0[3]}, [r3]! + bne LoopKH1 + + vshl.s32 q4, q4, q14 + vshl.s32 q5, q5, q15 + + vqrdmulh.s32 q4, q4, q12 + vqrdmulh.s32 q5, q5, q13 + + sub lr, sp, #144 + vld1.32 {q0, q1}, [lr] + vand q2, q4, q0 + vshr.s32 q2, q2, #31 + vqadd.s32 q4, q4, q2 + vrshl.s32 q4, q4, q0 + + vand q2, q5, q1 + vshr.s32 q2, q2, #31 + vqadd.s32 q5, q5, q2 + vrshl.s32 q5, q5, q1 + + vadd.i32 q4, q4, q10 + vadd.i32 q5, q5, q11 + + sub lr, sp, #176 + vld1.32 {q0, q1}, [lr] + vmax.s32 q4, q4, q0 + vmax.s32 q5, q5, q1 + + sub lr, sp, #208 + vld1.32 {q0, q1}, [lr] + vmin.s32 q4, q4, q0 + vmin.s32 q5, q5, q1 + + vqmovn.s32 d0, q4 + vqmovn.s32 d1, q5 + vqmovn.s16 d0, q0 + + mov r8, r0 + vst1.8 {d0[0]}, [r8]! + vst1.8 {d0[1]}, [r8]! + vst1.8 {d0[2]}, [r8]! + vst1.8 {d0[3]}, [r8]! + vst1.8 {d0[4]}, [r8]! + vst1.8 {d0[5]}, [r8]! + vst1.8 {d0[6]}, [r8]! + vst1.8 {d0[7]}, [r8]! 
+ ldr r12, [sp, #20] add r0, r0, r12 - ldr r12, [sp, #76] - add r1, r1, r12 + add r1, r1, r11 subs r5, r5, #1 bne LoopW - ldr r3, [sp, #64] - ldr r12, [sp] - add r12, r12, r3 - str r12, [sp] - ldr r3, [sp, #72] - ldr r12, [sp, #4] - add r12, r12, r3 - str r12, [sp, #4] - subs r4, r4, #1 - bne LoopH -LoopWEnd: - sub sp, sp, #112 - vpop {q4-q7} - pop {r0-r8, r10, r11, pc} + + LoopEndW: + ldr r12, [sp, #16] + ldr r1, [sp, #-48] + add r1, r1, r12 + str r1, [sp, #-48] + ldr r12, [sp, #24] + ldr r1, [sp, #-44] + add r1, r1, r12 + str r1, [sp, #-44] + subs r4, r4, #1 + bne LoopH + + LoopEndH: + sub sp, sp, #208 + vpop {q0, q1} + vpop {q0, q1} + vpop {q0, q1} + vpop {q4-q7} + pop {r0-r8, r10, r11, pc} #endif #endif diff --git a/mindspore/lite/nnacl/int8/common_func.h b/mindspore/lite/nnacl/int8/common_func.h index 3d35e6fc37..95ba01b808 100644 --- a/mindspore/lite/nnacl/int8/common_func.h +++ b/mindspore/lite/nnacl/int8/common_func.h @@ -39,6 +39,11 @@ void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, size_t oc4, size_t offset); +void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, + size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, + size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp, + int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, + int32_t *acc_min, int32_t *acc_max); #endif #ifdef ENABLE_ARM32 @@ -59,11 +64,6 @@ void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *wei void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t 
in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); -void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height, - size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, - size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp, - int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, - int32_t *acc_min, int32_t *acc_max); void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t acc_max); diff --git a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c index c5590e1951..164edb387e 100644 --- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c +++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c @@ -295,7 +295,7 @@ void ConvDwSWInt8(int8_t *output_data, const int8_t *input_data, const int16_t * int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; const int8_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; -#ifdef ENABLE_ARM64 +#ifdef ENABLE_ARM ConvDwInt8Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t), sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int8_t),