You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ConvDwInt8Row.S 3.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  #ifdef __arm__
  #ifndef __aarch64__

  // Offsets of the stack-passed arguments relative to sp AFTER the prologue.
  // The prologue pushes 10 core registers (r4-r12, lr: 40 bytes) and
  // 4 quad registers (q4-q7: 64 bytes), so the first stack argument of the
  // caller sits 104 bytes above sp.
  #define CONVDW_ROW_ARG_CHANNEL 104
  #define CONVDW_ROW_ARG_STEP    108
  #define CONVDW_ROW_ARG_ZP      112

  .text
  .align 5
  .global ConvDwInt8Row
  #ifndef __APPLE__
  .type ConvDwInt8Row, %function
  #endif

  // void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
  //                    int output_channel, int input_step, int8_t input_zp)
  //
  // Accumulates one depthwise-convolution tap into an int32 output row:
  //   for p in [0, num_pixels), c in [0, output_channel):
  //     output_ptr[p * output_channel + c] += (input_ptr[p * input_step + c] - input_zp) * weight_ptr[c]
  // The output is read, updated and written back in place; the same weight
  // vector is reused for every pixel. Nothing is done when num_pixels <= 0,
  // and a pixel contributes nothing when output_channel <= 0.
  //
  // Syntax: GNU as, ARM (AArch32) with NEON, preprocessed by cpp.
  // ABI:    AAPCS32.
  // In:     r0 = output_ptr, r1 = input_ptr, r2 = weight_ptr, r3 = num_pixels,
  //         [sp] = output_channel, [sp, 4] = input_step, [sp, 8] = input_zp (at entry)
  // Clobb:  r0-r3, r12, flags, q0-q3, q8, d30 (all caller-saved);
  //         r4-r11, lr and q4-q7 are saved in the prologue and restored on exit.
  //
  // Register roles inside the function:
  //   r0  output read cursor        r7  output write cursor (runs in step with r0)
  //   r1  input base of the pixel   r8  input cursor inside the pixel
  //   r2  weight base               r10 weight cursor inside the pixel
  //   r3  pixels remaining          r11 channels remaining in the pixel
  //   r4  output_channel            r5  input_step (added to r1 per pixel)
  //   r6  input_zp                  d30 input_zp broadcast to 8 x s8
  //   r9, r12, lr  scalar temporaries of the one-channel tail loop
  ConvDwInt8Row:
      // The return uses "pop {pc}", so lr is saved together with r4-r12 and
      // can be reused as a scratch register in the scalar tail loop.
      push    {r4-r12, lr}
      vpush   {q4-q7}

      // sp is deliberately NOT moved back up to reach the stack arguments.
      // The saved registers must stay at or above sp: memory below sp may be
      // overwritten at any time (for example by a signal frame), which would
      // corrupt the values restored in the epilogue, including the return address.
      cmp     r3, #0
      ble     End                                  // nothing to do for num_pixels <= 0

      ldr     r4, [sp, #CONVDW_ROW_ARG_CHANNEL]    // output_channel
      ldr     r5, [sp, #CONVDW_ROW_ARG_STEP]       // input_step
      ldr     r6, [sp, #CONVDW_ROW_ARG_ZP]         // input_zp (extended to a word by the caller)
      vdup.8  d30, r6                              // zp in every s8 lane
      mov     r7, r0                               // write cursor starts at the read cursor

  LoopPixel:
      mov     r8, r1                               // input cursor  = start of this pixel
      mov     r10, r2                              // weight cursor = start of the weights
      mov     r11, r4                              // channels left in this pixel

  // ---- 16 channels per iteration, software pipelined ----
  // The first half (8 channels) of a group is computed before the store of
  // the previous group has to be issued; the second half is finished either
  // at the top of LoopDepth16 or in LoopDepth16Out.
  LoopDepth16In:
      cmp     r11, #16
      blt     L8
      sub     r11, r11, #16

      vld1.8  {q0}, [r8]!                          // 16 inputs
      vld1.16 {q1, q2}, [r10]!                     // 16 weights
      vsubl.s8 q3, d0, d30                         // inputs 0-7 minus zp, widened to s16
      vld1.32 {q4, q5}, [r0]!                      // accumulators 0-7
      vmlal.s16 q4, d6, d2
      vmlal.s16 q5, d7, d3

      cmp     r11, #16
      blt     LoopDepth16Out

  LoopDepth16:
      // finish the group that is in flight: store 0-7, compute and store 8-15
      vst1.32 {q4, q5}, [r7]!
      vsubl.s8 q6, d1, d30                         // inputs 8-15 minus zp
      vld1.32 {q7, q8}, [r0]!                      // accumulators 8-15
      vmlal.s16 q7, d12, d4
      vmlal.s16 q8, d13, d5
      vst1.32 {q7, q8}, [r7]!

      // start the next group: load 16 channels, compute 0-7
      vld1.8  {q0}, [r8]!
      vld1.16 {q1, q2}, [r10]!
      vsubl.s8 q3, d0, d30
      vld1.32 {q4, q5}, [r0]!
      vmlal.s16 q4, d6, d2
      vmlal.s16 q5, d7, d3

      sub     r11, r11, #16
      cmp     r11, #16
      bge     LoopDepth16

  LoopDepth16Out:
      // drain the pipeline: last group, store 0-7, compute and store 8-15
      vst1.32 {q4, q5}, [r7]!
      vsubl.s8 q6, d1, d30
      vld1.32 {q7, q8}, [r0]!
      vmlal.s16 q7, d12, d4
      vmlal.s16 q8, d13, d5
      vst1.32 {q7, q8}, [r7]!

  // ---- 8 channels per iteration ----
  L8:
      cmp     r11, #8
      blt     L0

  LoopDepth8:
      vld1.8  {d0}, [r8]!                          // 8 inputs
      vld1.16 {d2, d3}, [r10]!                     // 8 weights
      vsubl.s8 q2, d0, d30                         // inputs minus zp, widened to s16
      vld1.32 {q3}, [r0]!                          // accumulators 0-3
      vmlal.s16 q3, d4, d2
      vst1.32 {q3}, [r7]!
      vld1.32 {q4}, [r0]!                          // accumulators 4-7
      vmlal.s16 q4, d5, d3
      vst1.32 {q4}, [r7]!

      sub     r11, r11, #8
      cmp     r11, #8
      bge     LoopDepth8

  // ---- one channel per iteration ----
  L0:
      cmp     r11, #0
      ble     LoopDepthEnd                         // also skips a negative channel count

  LoopDepth0:
      ldrsb   r12, [r8], #1                        // input
      ldrsh   r9, [r10], #2                        // weight
      sub     r12, r12, r6                         // input - zp
      ldr     lr, [r0], #4                         // accumulator
      smlabb  r12, r12, r9, lr                     // acc + (input - zp) * weight
      str     r12, [r7], #4
      subs    r11, r11, #1
      bne     LoopDepth0

  LoopDepthEnd:
      add     r1, r1, r5                           // next pixel: input_ptr += input_step
      subs    r3, r3, #1
      bne     LoopPixel

  End:
      vpop    {q4-q7}
      pop     {r4-r12, pc}

  #endif
  #endif