You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ConvDwInt8Row.S 2.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  #ifdef __aarch64__

  .text
  .align 5
  .global ConvDwInt8Row
  #ifndef __APPLE__
  .type ConvDwInt8Row, %function
  #endif

  // -----------------------------------------------------------------------------
  // void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr,
  //                    const int16_t *weight_ptr, int num_pixels,
  //                    int output_channel, int input_step, int8_t input_zp)
  //
  // Target: AArch64, GNU assembler syntax, AAPCS64 calling convention.
  //
  // For every pixel p in [0, num_pixels) and channel c in [0, output_channel):
  //     *output_ptr++ += weight_ptr[c] * (input_ptr[c] - input_zp)
  // The weight pointer restarts at weight_ptr for each pixel, input_ptr advances
  // by input_step bytes per pixel, and output_ptr keeps advancing across pixels.
  // Nothing is done when num_pixels <= 0; channels are skipped when
  // output_channel <= 0.
  //
  // In:    x0 = output_ptr      x1 = input_ptr      x2 = weight_ptr
  //        w3 = num_pixels      w4 = output_channel w5 = input_step
  //        w6 = input_zp (only the low 8 bits are meaningful)
  // Out:   none (accumulators are updated in place)
  // Clobb: x0, x1, x3-x5, x7-x13, v0-v3, v16-v21, v31, flags
  //        Only caller-saved registers are written, so nothing is spilled to
  //        the stack (v8-v15 and x19-x29 are never touched).
  //
  // NOTE(review): Mach-O C symbols normally carry a leading underscore; confirm
  // how the Apple build maps this symbol name.
  // -----------------------------------------------------------------------------
  ConvDwInt8Row:
      // AAPCS64 leaves the upper 32 bits of an int argument register unspecified,
      // so widen the three int arguments before using them as 64-bit values.
      sxtw    x3, w3                          // x3 = (int64) num_pixels
      sxtw    x4, w4                          // x4 = (int64) output_channel
      sxtw    x5, w5                          // x5 = (int64) input_step
      cmp     x3, #0
      ble     End                             // nothing to do for num_pixels <= 0

      mov     x10, x0                         // x10 = store cursor, x0 = load cursor (same buffer)
      dup     v31.8b, w6                      // broadcast low byte of input_zp to 8 lanes

  LoopOutPixel:
      mov     x7, x1                          // x7 = input cursor for this pixel
      mov     x8, x2                          // x8 = weight cursor, restarted per pixel
      mov     x9, x4                          // x9 = channels still to process

  // ---- 16 channels per iteration, software pipelined ---------------------------
  // The first 8 channels of a group are multiplied before the loop back-edge and
  // stored at the top of the next iteration (or in LoopDepth16Out).
  LoopDepth16In:
      cmp     x9, #16
      blt     L8
      sub     x9, x9, #16

      ld1     {v0.8b, v1.8b}, [x7], #16       // 16 int8 inputs
      ld1     {v2.8h, v3.8h}, [x8], #32       // 16 int16 weights
      ld1     {v16.4s, v17.4s}, [x0], #32     // accumulators for channels 0-7

      ssubl   v20.8h, v0.8b, v31.8b           // widen(input - zp), channels 0-7
      smlal   v16.4s, v20.4h, v2.4h
      smlal2  v17.4s, v20.8h, v2.8h

      cmp     x9, #16
      blt     LoopDepth16Out

  LoopDepth16:
      st1     {v16.4s, v17.4s}, [x10], #32    // flush channels 0-7 of previous group
      ld1     {v18.4s, v19.4s}, [x0], #32     // accumulators for channels 8-15

      ssubl   v21.8h, v1.8b, v31.8b           // widen(input - zp), channels 8-15
      smlal   v18.4s, v21.4h, v3.4h
      smlal2  v19.4s, v21.8h, v3.8h

      st1     {v18.4s, v19.4s}, [x10], #32

      ld1     {v0.8b, v1.8b}, [x7], #16       // next group of 16 inputs
      ld1     {v2.8h, v3.8h}, [x8], #32
      ld1     {v16.4s, v17.4s}, [x0], #32

      ssubl   v20.8h, v0.8b, v31.8b
      smlal   v16.4s, v20.4h, v2.4h
      smlal2  v17.4s, v20.8h, v2.8h

      sub     x9, x9, #16
      cmp     x9, #16
      bge     LoopDepth16

  LoopDepth16Out:
      // Drain the pipeline: store channels 0-7, then finish channels 8-15.
      st1     {v16.4s, v17.4s}, [x10], #32
      ld1     {v18.4s, v19.4s}, [x0], #32

      ssubl   v21.8h, v1.8b, v31.8b
      smlal   v18.4s, v21.4h, v3.4h
      smlal2  v19.4s, v21.8h, v3.8h

      st1     {v18.4s, v19.4s}, [x10], #32

  // ---- 8 channels per iteration -------------------------------------------------
  L8:
      cmp     x9, #8
      blt     L0

  LoopDepth8:
      ld1     {v0.8b}, [x7], #8
      ld1     {v2.8h}, [x8], #16
      ld1     {v16.4s, v17.4s}, [x0], #32

      ssubl   v20.8h, v0.8b, v31.8b
      smlal   v16.4s, v20.4h, v2.4h
      smlal2  v17.4s, v20.8h, v2.8h

      st1     {v16.4s, v17.4s}, [x10], #32

      sub     x9, x9, #8
      cmp     x9, #8
      bge     LoopDepth8

  // ---- scalar tail, one channel per iteration -----------------------------------
  L0:
      cmp     x9, #0
      ble     Loop16LineEnd                   // also guards a non-positive channel count

  LoopDepth0:
      ldrsb   w11, [x7], #1                   // w11 = input value (sign-extended int8)
      ldrsh   w12, [x8], #2                   // w12 = weight (sign-extended int16)
      ldr     w13, [x0], #4                   // w13 = accumulator
      // Subtract the zero point, matching the ssubl in the vector paths. The sxtb
      // extension reads only the low byte of w6, whose upper bits are unspecified
      // for an int8_t argument.
      sub     w11, w11, w6, sxtb              // w11 = input - input_zp
      madd    w13, w11, w12, w13              // acc += (input - input_zp) * weight
      str     w13, [x10], #4
      subs    x9, x9, #1
      bne     LoopDepth0

  Loop16LineEnd:
      subs    x3, x3, #1                      // one pixel finished; sets flags for bne
      add     x1, x1, x5                      // input_ptr += input_step (add leaves flags intact)
      bne     LoopOutPixel

  End:
      ret
  #endif