You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ConvDwInt8PostAlign4.S 4.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  #ifdef __aarch64__
      .text
      .align 5
      .global ConvDwInt8PostAlign4
  #ifndef __APPLE__
      .type ConvDwInt8PostAlign4, %function
  #endif

  //-----------------------------------------------------------------------
  // void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels,
  //                           int32_t output_zp, int32_t out_multiplier,
  //                           int32_t left_shift, int32_t right_shift,
  //                           int32_t acc_min, int32_t acc_max);
  //
  // Converts int32 values read from `buffer` into int8 values written to
  // `dst`. Each value goes through, in order:
  //   sqshl    by left_shift       (saturating shift by register)
  //   sqrdmulh by out_multiplier   (saturating rounding doubling mul, high half)
  //   srshl    by right_shift      (rounding shift by register), preceded by
  //                                a -1 correction on lanes where both the
  //                                value and right_shift are negative
  //   add      output_zp
  //   clamp    to [acc_min, acc_max]
  //   saturating narrow int32 -> int16 -> int8
  //
  // Values are consumed in groups of 16, then 8, then 4. A remainder of fewer
  // than 4 values is left unprocessed and the matching dst bytes are not
  // written.
  //
  // In:    x0 = dst, x1 = buffer, w2 = num_pixels, w3 = output_zp,
  //        w4 = out_multiplier, w5 = left_shift, w6 = right_shift,
  //        w7 = acc_min, [sp] = acc_max (ninth argument, passed on the stack)
  // Clobb: x0, x1, x2, x8, v0-v3, v16-v19, v26-v31, flags
  //
  // AAPCS64 requires a callee to preserve v8-v15 and x19-x29, see
  // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
  // None of those registers are touched here, so nothing is saved or restored.
  //
  // NOTE(review): right_shift is fed directly to srshl, so it presumably
  // arrives as a value <= 0 (negative means shift right) -- confirm with the
  // caller.
  //-----------------------------------------------------------------------
  ConvDwInt8PostAlign4:
      // acc_max does not fit in x0-x7; it is a 32-bit value, so load 32 bits.
      ldr w8, [sp]

      // Broadcast the per-call constants across all four int32 lanes.
      dup v26.4s, w5                      // left_shift
      dup v27.4s, w4                      // out_multiplier
      dup v28.4s, w6                      // right_shift
      dup v29.4s, w3                      // output_zp
      dup v30.4s, w7                      // acc_min
      dup v31.4s, w8                      // acc_max

      // num_pixels is a C `int`: only w2 is defined by the ABI, so every
      // compare and subtract on the counter uses the 32-bit view.
      cmp w2, #16
      blt LoopDepth8

  LoopDepth16:
      // Invariant: w2 >= 16. Load 16 int32 values, advance buffer by 64 bytes.
      ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64

      sqshl v0.4s, v0.4s, v26.4s
      sqshl v1.4s, v1.4s, v26.4s
      sqshl v2.4s, v2.4s, v26.4s
      sqshl v3.4s, v3.4s, v26.4s

      sqrdmulh v0.4s, v0.4s, v27.4s
      sqrdmulh v1.4s, v1.4s, v27.4s
      sqrdmulh v2.4s, v2.4s, v27.4s
      sqrdmulh v3.4s, v3.4s, v27.4s

      // Per vector: (value & right_shift) >> 31 is -1 only on lanes where both
      // sign bits are set; add that correction, then apply the rounding shift.
      and v16.16b, v28.16b, v0.16b
      sshr v16.4s, v16.4s, #31
      sqadd v0.4s, v0.4s, v16.4s
      srshl v0.4s, v0.4s, v28.4s

      and v17.16b, v28.16b, v1.16b
      sshr v17.4s, v17.4s, #31
      sqadd v1.4s, v1.4s, v17.4s
      srshl v1.4s, v1.4s, v28.4s

      and v18.16b, v28.16b, v2.16b
      sshr v18.4s, v18.4s, #31
      sqadd v2.4s, v2.4s, v18.4s
      srshl v2.4s, v2.4s, v28.4s

      and v19.16b, v28.16b, v3.16b
      sshr v19.4s, v19.4s, #31
      sqadd v3.4s, v3.4s, v19.4s
      srshl v3.4s, v3.4s, v28.4s

      // Add the output zero point.
      add v0.4s, v0.4s, v29.4s
      add v1.4s, v1.4s, v29.4s
      add v2.4s, v2.4s, v29.4s
      add v3.4s, v3.4s, v29.4s

      // Clamp to [acc_min, acc_max].
      smax v0.4s, v0.4s, v30.4s
      smax v1.4s, v1.4s, v30.4s
      smax v2.4s, v2.4s, v30.4s
      smax v3.4s, v3.4s, v30.4s
      smin v0.4s, v0.4s, v31.4s
      smin v1.4s, v1.4s, v31.4s
      smin v2.4s, v2.4s, v31.4s
      smin v3.4s, v3.4s, v31.4s

      // Saturating narrow int32 -> int16 -> int8; each vector ends up with
      // its four results in the low 32 bits.
      sqxtn v0.4h, v0.4s
      sqxtn v1.4h, v1.4s
      sqxtn v2.4h, v2.4s
      sqxtn v3.4h, v3.4s
      sqxtn v0.8b, v0.8h
      sqxtn v1.8b, v1.8h
      sqxtn v2.8b, v2.8h
      sqxtn v3.8b, v3.8h

      // Store 4 bytes per vector, 16 bytes in total.
      st1 {v0.s}[0], [x0], #4
      st1 {v1.s}[0], [x0], #4
      st1 {v2.s}[0], [x0], #4
      st1 {v3.s}[0], [x0], #4

      sub w2, w2, #16
      cmp w2, #16
      bge LoopDepth16

  LoopDepth8:
      // Invariant: w2 < 16 here, so this tail can run at most once.
      cmp w2, #8
      blt LoopDepth4

      ld1 {v0.4s, v1.4s}, [x1], #32

      sqshl v0.4s, v0.4s, v26.4s
      sqshl v1.4s, v1.4s, v26.4s

      sqrdmulh v0.4s, v0.4s, v27.4s
      sqrdmulh v1.4s, v1.4s, v27.4s

      and v16.16b, v28.16b, v0.16b
      sshr v16.4s, v16.4s, #31
      sqadd v0.4s, v0.4s, v16.4s
      srshl v0.4s, v0.4s, v28.4s

      and v17.16b, v28.16b, v1.16b
      sshr v17.4s, v17.4s, #31
      sqadd v1.4s, v1.4s, v17.4s
      srshl v1.4s, v1.4s, v28.4s

      add v0.4s, v0.4s, v29.4s
      add v1.4s, v1.4s, v29.4s

      smax v0.4s, v0.4s, v30.4s
      smax v1.4s, v1.4s, v30.4s
      smin v0.4s, v0.4s, v31.4s
      smin v1.4s, v1.4s, v31.4s

      sqxtn v0.4h, v0.4s
      sqxtn v1.4h, v1.4s
      sqxtn v0.8b, v0.8h
      sqxtn v1.8b, v1.8h

      st1 {v0.s}[0], [x0], #4
      st1 {v1.s}[0], [x0], #4

      sub w2, w2, #8

  LoopDepth4:
      // Invariant: w2 < 8 here, so this tail can run at most once. Anything
      // left after it (fewer than 4 values) is intentionally not processed.
      cmp w2, #4
      blt End

      ld1 {v0.4s}, [x1], #16

      sqshl v0.4s, v0.4s, v26.4s
      sqrdmulh v0.4s, v0.4s, v27.4s

      and v16.16b, v28.16b, v0.16b
      sshr v16.4s, v16.4s, #31
      sqadd v0.4s, v0.4s, v16.4s
      srshl v0.4s, v0.4s, v28.4s

      add v0.4s, v0.4s, v29.4s
      smax v0.4s, v0.4s, v30.4s
      smin v0.4s, v0.4s, v31.4s

      sqxtn v0.4h, v0.4s
      sqxtn v0.8b, v0.8h

      st1 {v0.s}[0], [x0], #4

  End:
      ret
  #endif