You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ConvDw3x3BorderPixelInt8.S 3.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  #ifdef __arm__
  #ifndef __aarch64__

  .text
  .align 5
  .global ConvDw3x3BorderPixelInt8
  #ifndef __APPLE__
  .type ConvDw3x3BorderPixelInt8, %function
  #endif

  //-----------------------------------------------------------------------------
  // void ConvDw3x3BorderPixelInt8(int8_t *dst, const int8_t *src, const int16_t *weight,
  //                               const int32_t *bias, size_t height, size_t width,
  //                               size_t in_kh_step, size_t in_kw_step, size_t channel,
  //                               size_t in_zp, size_t out_zp, size_t out_multiplier,
  //                               size_t left_shift, size_t right_shift,
  //                               size_t acc_min, size_t acc_max)
  //
  // Target: AArch32 with NEON, AAPCS32 calling convention.
  //
  // Accumulates (src - in_zp) * weight over height x width taps for 8 channels
  // at a time, starting from the bias, then requantizes and stores 8 int8 values.
  //
  // Argument passing:
  //   r0 = dst, r1 = src, r2 = weight, r3 = bias
  //   all remaining arguments are read from the caller stack (offsets below).
  //
  // Register roles inside the function:
  //   r4  = remaining rows (reloaded from height for every channel group)
  //   r5  = remaining taps in the row (reloaded from width for every row)
  //   r6  = in_kh_step, byte step between input rows
  //   r7  = in_kw_step, byte step between input taps
  //   r8  = channels still to process
  //   r9  = input row pointer,  r11 = input tap pointer
  //   r10 = weight row pointer, r12 = weight tap pointer
  //   lr  = weight tap stride in bytes (channel * 2, weights are int16)
  //   d18 = in_zp (8 x i8, only the lowest-addressed byte of the argument is read)
  //   q15 = out_zp, q14 = out_multiplier, q13 = left_shift, q12 = right_shift,
  //   q11 = acc_min, q10 = acc_max (each 4 x i32)
  //   q3, q4 = 32-bit accumulators for channels 0-3 and 4-7
  //
  // Notes:
  //   - All three loops test their counter after the body, so height and width
  //     must be at least 1 and at least one group of 8 channels is always
  //     processed. NOTE(review): callers presumably guarantee channel >= 8.
  //   - The weight row pointer always advances by 3 taps per row, independent
  //     of width.
  //   - right_shift is used as a VRSHL shift count; presumably it is zero or
  //     negative (a right shift) - confirm against the caller.
  //
  // Clobbers: r0-r3, q0-q3, q8-q15, flags. r4-r12, lr and q4-q7 are preserved.
  //-----------------------------------------------------------------------------

  // Bytes pushed by the prologue: 10 core registers * 4 + 4 q registers * 16.
  #define SAVED_BYTES        104
  // Offsets of the stack-passed arguments relative to SP after the prologue.
  #define ARG_HEIGHT         (SAVED_BYTES + 0)
  #define ARG_WIDTH          (SAVED_BYTES + 4)
  #define ARG_IN_KH_STEP     (SAVED_BYTES + 8)
  #define ARG_IN_KW_STEP     (SAVED_BYTES + 12)
  #define ARG_CHANNEL        (SAVED_BYTES + 16)
  #define ARG_IN_ZP          (SAVED_BYTES + 20)
  #define ARG_OUT_ZP         (SAVED_BYTES + 24)
  #define ARG_OUT_MULTIPLIER (SAVED_BYTES + 28)
  #define ARG_LEFT_SHIFT     (SAVED_BYTES + 32)
  #define ARG_RIGHT_SHIFT    (SAVED_BYTES + 36)
  #define ARG_ACC_MIN        (SAVED_BYTES + 40)
  #define ARG_ACC_MAX        (SAVED_BYTES + 44)

  ConvDw3x3BorderPixelInt8:
      // Return follows the clang convention (push lr / pop pc) rather than bx lr.
      // r4-r11 and q4-q7 are callee-saved under AAPCS32; r12 is pushed as well,
      // which keeps the number of pushed core registers even.
      push {r4-r12, lr}
      vpush {q4-q7}

      // SP is deliberately NOT moved back up to the argument area: the saved
      // registers must stay at or above SP, otherwise anything that uses the
      // stack asynchronously (for example a signal frame) could overwrite them.
      ldr r6, [sp, #ARG_IN_KH_STEP]
      ldr r7, [sp, #ARG_IN_KW_STEP]
      ldr r8, [sp, #ARG_CHANNEL]

      // Broadcast the quantization parameters once; they are loop invariant.
      ldrb r10, [sp, #ARG_IN_ZP]
      vdup.8 d18, r10
      ldr r10, [sp, #ARG_OUT_ZP]
      vdup.32 q15, r10
      ldr r10, [sp, #ARG_OUT_MULTIPLIER]
      vdup.32 q14, r10
      ldr r10, [sp, #ARG_LEFT_SHIFT]
      vdup.32 q13, r10
      ldr r10, [sp, #ARG_RIGHT_SHIFT]
      vdup.32 q12, r10
      ldr r10, [sp, #ARG_ACC_MIN]
      vdup.32 q11, r10
      ldr r10, [sp, #ARG_ACC_MAX]
      vdup.32 q10, r10

      add lr, r8, r8                  // lr = channel * 2 = weight tap stride in bytes

  LoopC:                              // one iteration per group of 8 channels
      mov r9, r1                      // restart at the first input row
      mov r10, r2                     // restart at the first weight row
      ldr r4, [sp, #ARG_HEIGHT]
      vld1.32 {q3}, [r3]!             // accumulators start from the bias,
      vld1.32 {q4}, [r3]!             // bias pointer moves on by 8 x int32

  LoopH:                              // one iteration per row
      mov r11, r9
      mov r12, r10
      ldr r5, [sp, #ARG_WIDTH]

  LoopW:                              // one iteration per tap
      vld1.8 {d0}, [r11], r7          // 8 x int8 input, step to the next tap
      vld1.16 {d2, d3}, [r12], lr     // 8 x int16 weight, step to the next tap
      vsubl.s8 q2, d0, d18            // widen to int16 and remove the input zero point
      vmlal.s16 q3, d4, d2            // channels 0-3
      vmlal.s16 q4, d5, d3            // channels 4-7
      subs r5, r5, #1
      bne LoopW

      add r9, r9, r6                  // next input row
      add r10, r10, lr                // next weight row: advance by 3 taps
      add r10, r10, lr, lsl #1        //   (lr + 2 * lr), flags are left untouched
      subs r4, r4, #1
      bne LoopH

      // Requantize channels 0-3.
      vshl.s32 q3, q3, q13            // apply left_shift
      vqrdmulh.s32 q3, q3, q14        // saturating rounding doubling high multiply
      vand q5, q3, q12                // sign bit survives only if the value and
      vshr.s32 q5, q5, #31            //   right_shift are both negative -> -1, else 0
      vqadd.s32 q3, q3, q5            // rounding correction ahead of the shift
      vrshl.s32 q3, q3, q12           // rounding shift by right_shift
      vadd.i32 q3, q3, q15            // add the output zero point
      vmax.s32 q3, q3, q11            // clamp to [acc_min, acc_max]
      vmin.s32 q3, q3, q10
      vqmovn.s32 d14, q3              // saturate to int16 (low half of q7)

      // Requantize channels 4-7, same sequence.
      vshl.s32 q4, q4, q13
      vqrdmulh.s32 q4, q4, q14
      vand q6, q4, q12
      vshr.s32 q6, q6, #31
      vqadd.s32 q4, q4, q6
      vrshl.s32 q4, q4, q12
      vadd.i32 q4, q4, q15
      vmax.s32 q4, q4, q11
      vmin.s32 q4, q4, q10
      vqmovn.s32 d15, q4              // saturate to int16 (high half of q7)

      vqmovn.s16 d16, q7              // saturate 8 x int16 to 8 x int8
      vst1.8 {d16}, [r0]!             // store 8 outputs, dst moves on by 8 bytes

      add r1, r1, #8                  // next 8 input channels (int8)
      add r2, r2, #16                 // next 8 weight channels (int16)
      sub r8, r8, #8
      cmp r8, #8
      bge LoopC                       // continue while a full group of 8 remains

      vpop {q4-q7}
      pop {r4-r12, pc}
  #endif
  #endif