Browse Source

!6453 [MS][LITE][CPU] fix arm32 ConvDwInt8Center.S bug

Merge pull request !6453 from liuzhongkai/arm32_new1
tags/v1.0.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
07e942c2da
3 changed files with 257 additions and 203 deletions
  1. +251
    -197
      mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S
  2. +5
    -5
      mindspore/lite/nnacl/int8/common_func.h
  3. +1
    -1
      mindspore/lite/nnacl/int8/conv_depthwise_int8.c

+ 251
- 197
mindspore/lite/nnacl/assembly/arm32/ConvDwInt8Center.S View File

@@ -7,219 +7,273 @@
#ifndef __APPLE__ #ifndef __APPLE__
.type ConvDwInt8Center, %function .type ConvDwInt8Center, %function
#endif #endif
// void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, const int32_t *bias, size_t height, size_t width,
// size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t in_sw_step,
// size_t in_kh_step, size_t in_kw_step, int out_multiplier, int left_shift,
// int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max);
// r0: dst, r1: src, r2: weight, r3: bias, #48: height, #52: width, #56: kernel_h, #60: kernel_w,
// #64: out_h_step, #68: block_channel, #72: in_sh_step, #76: in_sw_step, #80: in_kh_step, #84: in_kw_step
// #88: out_multiplier, #92: left_shift, #96: right_shift, #100: out_zp, #104: acc_min, #108: acc_max
// void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, int height,
// int width, int kernel_h, int kernel_w, int out_h_step, int block_channel, int in_sh_step,
// int in_sw_step, int in_kh_step, int in_kw_step, int8_t *in_zp, int32_t *out_zp,
// int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t *acc_min,
// int32_t *acc_max)
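// Offsets are relative to sp once it has been moved back to its value at function entry
// (48 bytes of pushed core registers + 64 bytes of q4-q7 + 96 bytes of spilled quantization vectors = 208).
// #-48: dst, #-44: src, #-40: weight, #-36: bias, #0: height, #4: width, #8: kernel_h, #12: kernel_w,
// #16: out_h_step, #20: block_channel, #24: in_sh_step, #28: in_sw_step, #32: in_kh_step, #36: in_kw_step
// #40: in_zp, #44: out_zp, #48: out_multiplier, #52: left_shift, #56: right_shift, #60: acc_min, #64: acc_max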
ConvDwInt8Center: ConvDwInt8Center:
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it for subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
// at return, clang generates "push {lr}, pop {pc}" while gcc will generate "bx lr"
// according to https://stackoverflow.com/questions/53625807
// even if we jump to the link register instead of saving it, we still have to save it for subroutine calls anyway
// clang's rule seems simpler, though there are no subroutine calls here
// r4-r8 and q4-q7 must be saved according to https://static.docs.arm.com/ihi0042/i/aapcs32.pdf
push {r0-r8, r10, r11, lr} push {r0-r8, r10, r11, lr}
vpush {q4-q7} vpush {q4-q7}
add sp, sp, #112


ldr r4, [sp, #48]
ldr lr, [sp, #168]
vld1.32 {q0, q1}, [lr]
vpush {q0, q1}
ldr lr, [sp, #204]
vld1.32 {q0, q1}, [lr]
vpush {q0, q1}
ldr lr, [sp, #240]
vld1.32 {q0, q1}, [lr]
vpush {q0, q1}
add sp, sp, #208


ldr r12, [sp, #92]
vdup.32 q9, r12
ldr r1, [sp, #-36]
vld1.32 {q8, q9}, [r1]
ldr r1, [sp, #44]
vld1.32 {q10, q11}, [r1]
ldr r1, [sp, #48]
vld1.32 {q12, q13}, [r1]
ldr r1, [sp, #52]
vld1.32 {q14, q15}, [r1]


ldr r11, [sp, #88]
vdup.32 q10, r11
ldr r11, [sp, #28]
ldr r4, [sp]
LoopH:
ldr r1, [sp, #-44]
ldr r0, [sp, #-48]
ldr r5, [sp, #4]
LoopW2:
vmov q4, q8
vmov q5, q9
vmov q6, q8
vmov q7, q9
mov r7, r1
ldr r3, [sp, #-40]
ldr r6, [sp, #8]
LoopKH:
mov r9, r7
ldr r10, [sp, #12]
LoopKW:
mov r8, r9
vld1.16 {q0}, [r3]!
ldr lr, [sp, #40]
vld1.8 {d2}, [lr]


ldr r10, [sp, #96]
vdup.32 q11, r10
vld1.8 {d3}, [r8]
add r8, r8, r11
vsubl.s8 q2, d3, d2
vmlal.s16 q4, d4, d0
vmlal.s16 q5, d5, d1


ldr r8, [sp, #100]
vdup.32 q12, r8
ldr r7, [sp, #104]
vdup.32 q13, r7
vld1.8 {d3}, [r8]
add r8, r8, r11
vsubl.s8 q2, d3, d2
vmlal.s16 q6, d4, d0
vmlal.s16 q7, d5, d1


ldr r6, [sp, #108]
vdup.32 q14, r6
ldr r12, [sp, #36]
add r9, r9, r12
subs r10, r10, #1
bne LoopKW
ldr r12, [sp, #32]
add r7, r7, r12
subs r6, r6, #1
bne LoopKH


vld1.32 {q15}, [r3]
vshl.s32 q4, q4, q14
vshl.s32 q5, q5, q15
vshl.s32 q6, q6, q14
vshl.s32 q7, q7, q15


LoopH:
ldr r1, [sp, #4] // src_w
ldr r5, [sp, #52] // width
ldr r0, [sp] // dst_w
LoopW4:
ldr r11, [sp, #76] // in_sw_step
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
LoopKh4:
ldr r12, [sp, #80] //in_kh_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw4:
vld1.16 {d24}, [r2]!
vld1.16 {d8}, [r10]
add r10, r10, r11
vmlal.s16 q0, d8, d24
vld1.16 {d10}, [r10]
add r10, r10, r11
vmlal.s16 q1, d10, d24
vld1.16 {d12}, [r10]
add r10, r10, r11
vmlal.s16 q2, d12, d24
vld1.16 {d14}, [r10]
add r10, r10, r11
vmlal.s16 q3, d14, d24
subs r7, r7, #1
bne LoopKw4
ldr r12, [sp, #80]
add r8, r8, r12
subs r6, r6, #1
bne LoopKh4

vshl.s32 q0, q0, q9
vshl.s32 q1, q1, q9
vshl.s32 q2, q2, q9
vshl.s32 q3, q3, q9
vqrdmulh.s32 q0, q0, q10
vqrdmulh.s32 q1, q1, q10
vqrdmulh.s32 q2, q2, q10
vqrdmulh.s32 q3, q3, q10
vand q4, q0, q11
vshr.s32 q4, q4, #31
vqadd.s32 q0, q0, q4
vrshl.s32 q0, q0, q11
vand q5, q1, q11
vshr.s32 q5, q5, #31
vqadd.s32 q1, q1, q5
vrshl.s32 q1, q1, q11
vand q6, q2, q11
vshr.s32 q6, q6, #31
vqadd.s32 q2, q2, q6
vrshl.s32 q2, q2, q11
vand q7, q3, q11
vshr.s32 q7, q7, #31
vqadd.s32 q3, q3, q7
vrshl.s32 q3, q3, q11
vadd.i32 q0, q0, q12
vadd.i32 q1, q1, q12
vadd.i32 q2, q2, q12
vadd.i32 q3, q3, q12
vmax.s32 q0, q0, q13
vmax.s32 q1, q1, q13
vmax.s32 q2, q2, q13
vmax.s32 q3, q3, q13
vmin.s32 q0, q0, q14
vmin.s32 q1, q1, q14
vmin.s32 q2, q2, q14
vmin.s32 q3, q3, q14

vqmovn.s32 d0, q0
vqmovn.s32 d2, q1
vqmovn.s32 d4, q2
vqmovn.s32 d6, q3
vqmovn.s16 d0, q0
vqmovn.s16 d2, q1
vqmovn.s16 d4, q2
vqmovn.s16 d6, q3

mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d2[0]}, [r3]!
vst1.8 {d2[1]}, [r3]!
vst1.8 {d2[2]}, [r3]!
vst1.8 {d2[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d4[0]}, [r3]!
vst1.8 {d4[1]}, [r3]!
vst1.8 {d4[2]}, [r3]!
vst1.8 {d4[3]}, [r3]!
add r0, r0, r12
mov r3, r0
vst1.8 {d6[0]}, [r3]!
vst1.8 {d6[1]}, [r3]!
vst1.8 {d6[2]}, [r3]!
vst1.8 {d6[3]}, [r3]!
add r0, r0, r12
mov r3, r0
mov r12, #4
mul r11, r11, r12
vqrdmulh.s32 q4, q4, q12
vqrdmulh.s32 q5, q5, q13
vqrdmulh.s32 q6, q6, q12
vqrdmulh.s32 q7, q7, q13

sub lr, sp, #144
vld1.32 {q0, q1}, [lr]

vand q2, q4, q0
vshr.s32 q2, q2, #31
vqadd.s32 q4, q4, q2
vrshl.s32 q4, q4, q0

vand q2, q5, q1
vshr.s32 q2, q2, #31
vqadd.s32 q5, q5, q2
vrshl.s32 q5, q5, q1

vand q2, q6, q0
vshr.s32 q2, q2, #31
vqadd.s32 q6, q6, q2
vrshl.s32 q6, q6, q0

vand q2, q7, q1
vshr.s32 q2, q2, #31
vqadd.s32 q7, q7, q2
vrshl.s32 q7, q7, q1

vadd.i32 q4, q4, q10
vadd.i32 q5, q5, q11
vadd.i32 q6, q6, q10
vadd.i32 q7, q7, q11

sub lr, sp, #176
vld1.32 {q0, q1}, [lr]
vmax.s32 q4, q4, q0
vmax.s32 q5, q5, q1
vmax.s32 q6, q6, q0
vmax.s32 q7, q7, q1

sub lr, sp, #208
vld1.32 {q0, q1}, [lr]
vmin.s32 q4, q4, q0
vmin.s32 q5, q5, q1
vmin.s32 q6, q6, q0
vmin.s32 q7, q7, q1

vqmovn.s32 d0, q4
vqmovn.s32 d1, q5
vqmovn.s32 d2, q6
vqmovn.s32 d3, q7
vqmovn.s16 d0, q0
vqmovn.s16 d1, q1


ldr r12, [sp, #20]
mov r8, r0
vst1.8 {d0[0]}, [r8]!
vst1.8 {d0[1]}, [r8]!
vst1.8 {d0[2]}, [r8]!
vst1.8 {d0[3]}, [r8]!
vst1.8 {d0[4]}, [r8]!
vst1.8 {d0[5]}, [r8]!
vst1.8 {d0[6]}, [r8]!
vst1.8 {d0[7]}, [r8]!
add r0, r0, r12

mov r8, r0
vst1.8 {d1[0]}, [r8]!
vst1.8 {d1[1]}, [r8]!
vst1.8 {d1[2]}, [r8]!
vst1.8 {d1[3]}, [r8]!
vst1.8 {d1[4]}, [r8]!
vst1.8 {d1[5]}, [r8]!
vst1.8 {d1[6]}, [r8]!
vst1.8 {d1[7]}, [r8]!
add r0, r0, r12

add r1, r1, r11
add r1, r1, r11 add r1, r1, r11
sub r5, r5, #4
cmp r5, #0
ble LoopWEnd
cmp r5, #4
bge LoopW4
subs r5, r5, #2
beq LoopEndW
cmp r5, #2
bge LoopW2

LoopW: LoopW:
mov r8, r1 // src_kh
ldr r2, [sp, #8] // weight_kh
ldr r6, [sp, #56] // kernel_h
vmov q0, q15
LoopKh:
ldr r12, [sp, #84] //in_kw_step
ldr r7, [sp, #60] // kernel_w
mov r10, r8 // src_kw
LoopKw:
vld1.16 {d2}, [r10]
add r10, r10, r12
vld1.16 {d24}, [r2]!
vmlal.s16 q0, d2, d24
subs r7, r7, #1
bne LoopKw
ldr r12, [sp, #80]
add r8, r8, r12
vmov q4, q8
vmov q5, q9
mov r7, r1
ldr r3, [sp, #-40]
ldr r6, [sp, #8]
LoopKH1:
mov r9, r7
ldr r10, [sp, #12]
LoopKW1:
vld1.16 {q0}, [r3]!
ldr lr, [sp, #40]
vld1.8 {d2}, [lr]

vld1.8 {d3}, [r9]
vsubl.s8 q2, d3, d2
vmlal.s16 q4, d4, d0
vmlal.s16 q5, d5, d1

ldr r12, [sp, #36]
add r9, r9, r12
subs r10, r10, #1
bne LoopKW1
ldr r12, [sp, #32]
add r7, r7, r12
subs r6, r6, #1 subs r6, r6, #1
bne LoopKh

vshl.s32 q0, q0, q9
vqrdmulh.s32 q0, q0, q10
vand q4, q0, q11
vshr.s32 q4, q4, #31
vqadd.s32 q0, q0, q4
vrshl.s32 q0, q0, q11
vadd.i32 q0, q0, q12
vmax.s32 q0, q0, q13
vmin.s32 q0, q0, q14

vqmovn.s32 d0, q0
vqmovn.s16 d0, q0

mov r3, r0
ldr r12, [sp, #68]
vst1.8 {d0[0]}, [r3]!
vst1.8 {d0[1]}, [r3]!
vst1.8 {d0[2]}, [r3]!
vst1.8 {d0[3]}, [r3]!
bne LoopKH1

vshl.s32 q4, q4, q14
vshl.s32 q5, q5, q15

vqrdmulh.s32 q4, q4, q12
vqrdmulh.s32 q5, q5, q13

sub lr, sp, #144
vld1.32 {q0, q1}, [lr]
vand q2, q4, q0
vshr.s32 q2, q2, #31
vqadd.s32 q4, q4, q2
vrshl.s32 q4, q4, q0

vand q2, q5, q1
vshr.s32 q2, q2, #31
vqadd.s32 q5, q5, q2
vrshl.s32 q5, q5, q1

vadd.i32 q4, q4, q10
vadd.i32 q5, q5, q11

sub lr, sp, #176
vld1.32 {q0, q1}, [lr]
vmax.s32 q4, q4, q0
vmax.s32 q5, q5, q1

sub lr, sp, #208
vld1.32 {q0, q1}, [lr]
vmin.s32 q4, q4, q0
vmin.s32 q5, q5, q1

vqmovn.s32 d0, q4
vqmovn.s32 d1, q5
vqmovn.s16 d0, q0

mov r8, r0
vst1.8 {d0[0]}, [r8]!
vst1.8 {d0[1]}, [r8]!
vst1.8 {d0[2]}, [r8]!
vst1.8 {d0[3]}, [r8]!
vst1.8 {d0[4]}, [r8]!
vst1.8 {d0[5]}, [r8]!
vst1.8 {d0[6]}, [r8]!
vst1.8 {d0[7]}, [r8]!
ldr r12, [sp, #20]
add r0, r0, r12 add r0, r0, r12
ldr r12, [sp, #76]
add r1, r1, r12
add r1, r1, r11
subs r5, r5, #1 subs r5, r5, #1
bne LoopW bne LoopW
ldr r3, [sp, #64]
ldr r12, [sp]
add r12, r12, r3
str r12, [sp]
ldr r3, [sp, #72]
ldr r12, [sp, #4]
add r12, r12, r3
str r12, [sp, #4]
subs r4, r4, #1
bne LoopH
LoopWEnd:
sub sp, sp, #112
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}

LoopEndW:
ldr r12, [sp, #16]
ldr r1, [sp, #-48]
add r1, r1, r12
str r1, [sp, #-48]
ldr r12, [sp, #24]
ldr r1, [sp, #-44]
add r1, r1, r12
str r1, [sp, #-44]
subs r4, r4, #1
bne LoopH

LoopEndH:
sub sp, sp, #208
vpop {q0, q1}
vpop {q0, q1}
vpop {q0, q1}
vpop {q4-q7}
pop {r0-r8, r10, r11, pc}
#endif #endif
#endif #endif

+ 5
- 5
mindspore/lite/nnacl/int8/common_func.h View File

@@ -39,6 +39,11 @@ void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t
int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max); int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8, void IndirectGemmInt16to32_8x4(int32_t *dst, const int16_t *src, const int16_t *weight, size_t ksize, size_t ic8,
size_t oc4, size_t offset); size_t oc4, size_t offset);
void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp,
int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift,
int32_t *acc_min, int32_t *acc_max);
#endif #endif


#ifdef ENABLE_ARM32 #ifdef ENABLE_ARM32
@@ -59,11 +64,6 @@ void IndirectGemmInt8_4x4(int8_t *output, const int8_t *input, const int8_t *wei
void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width, void DeconvDwInt8Center(int32_t *dst, const int16_t *src, const int16_t *weight, size_t height, size_t width,
size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
size_t in_sw_step, size_t in_kh_step, size_t in_kw_step); size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, const int32_t *bias, size_t height,
size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel,
size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step, int8_t *in_zp,
int32_t *out_zp, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift,
int32_t *acc_min, int32_t *acc_max);
void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp, void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4, int32_t output_zp,
int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min, int32_t *out_multiplier, int32_t *left_shift, int32_t *right_shift, int32_t acc_min,
int32_t acc_max); int32_t acc_max);


+ 1
- 1
mindspore/lite/nnacl/int8/conv_depthwise_int8.c View File

@@ -295,7 +295,7 @@ void ConvDwSWInt8(int8_t *output_data, const int8_t *input_data, const int16_t *
int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_; int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
const int8_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_; const int8_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_; int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
#ifdef ENABLE_ARM64
#ifdef ENABLE_ARM
ConvDwInt8Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_, ConvDwInt8Center(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t), conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(int8_t),
sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int8_t), sliding->block_channel_ * sizeof(int8_t), sliding->in_sh_step_ * sizeof(int8_t),


Loading…
Cancel
Save