You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ConvDwInt8PostAlign4PerChannel.S 2.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  #ifdef __aarch64__
      .text
      .align 5
      .global ConvDwInt8PostAlign4PerChannel
  #ifndef __APPLE__
      .type ConvDwInt8PostAlign4PerChannel, %function
  #endif

  //-----------------------------------------------------------------------------
  // void ConvDwInt8PostAlign4PerChannel(int8_t *dst, int32_t *buffer, int channel4,
  //                                     int32_t output_zp, int32_t *out_multiplier,
  //                                     int32_t *left_shift, int32_t *right_shift,
  //                                     int32_t acc_min, int32_t acc_max);
  //
  // Requantizes int32 accumulators to int8 using per-channel parameters.
  // For every channel c processed:
  //     t      = sat(buffer[c] << left_shift[c])               (sqshl)
  //     t      = sat_rounding_doubling_high_mul(t, out_multiplier[c])   (sqrdmulh)
  //     t      = rounding shift of t by right_shift[c]          (and/sshr/sqadd + srshl)
  //     t      = clamp(t + output_zp, acc_min, acc_max)
  //     dst[c] = saturate_to_int8(t)
  //
  // ABI: AAPCS64
  // In:
  //     x0 = dst             (int8 output, advanced by 4 bytes per 4 channels)
  //     x1 = buffer          (int32 input accumulators)
  //     w2 = channel4        (number of channels; consumed 8, then 4, at a time)
  //     w3 = output_zp
  //     x4 = out_multiplier  (per-channel)
  //     x5 = left_shift      (per-channel)
  //     x6 = right_shift     (per-channel; srshl shifts right for negative counts)
  //     w7 = acc_min
  //     [sp] = acc_max       (ninth argument, passed on the stack)
  // Clobbers: x0, x1, w2, x4, x5, x6, x8, v0-v7, v16, v17, v29-v31, flags.
  //
  // Only caller-saved registers are used (v8-v15 and x19-x29 are untouched), so
  // nothing has to be spilled; see
  // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
  //
  // Channels are handled in groups of four. If channel4 is not a multiple of 4,
  // the trailing 1-3 channels are not processed; if it is < 4 (or negative)
  // nothing is written.
  //-----------------------------------------------------------------------------
  ConvDwInt8PostAlign4PerChannel:
      ldr x8, [sp]                        // acc_max lives on the stack; only w8 is used below
      dup v29.4s, w3                      // v29 = output_zp in every lane
      dup v30.4s, w7                      // v30 = acc_min  in every lane
      dup v31.4s, w8                      // v31 = acc_max  in every lane

      // channel4 is a 32-bit int: AAPCS64 leaves the upper half of x2 unspecified,
      // so all count arithmetic and comparisons below use w2.

  LoopDepth8:
      cmp w2, #8
      blt LoopDepth4                      // fewer than 8 channels left

      ld1 {v0.4s}, [x1], #16              // accumulators, channels 0-3
      ld1 {v1.4s}, [x1], #16              // accumulators, channels 4-7
      ld1 {v2.4s}, [x5], #16              // left_shift
      ld1 {v3.4s}, [x5], #16
      ld1 {v4.4s}, [x4], #16              // out_multiplier
      ld1 {v5.4s}, [x4], #16

      sqshl v0.4s, v0.4s, v2.4s           // saturating per-lane left shift
      sqshl v1.4s, v1.4s, v3.4s

      ld1 {v6.4s}, [x6], #16              // right_shift
      ld1 {v7.4s}, [x6], #16

      sqrdmulh v0.4s, v0.4s, v4.4s        // saturating rounding doubling multiply, high half
      sqrdmulh v1.4s, v1.4s, v5.4s

      // Rounding fix-up: lanes where both the shift count and the value have the
      // sign bit set get -1 added before srshl, so exact halves of negative
      // values round away from zero instead of toward +infinity.
      and v16.16b, v6.16b, v0.16b
      sshr v16.4s, v16.4s, #31
      sqadd v0.4s, v0.4s, v16.4s
      srshl v0.4s, v0.4s, v6.4s

      and v17.16b, v7.16b, v1.16b
      sshr v17.4s, v17.4s, #31
      sqadd v1.4s, v1.4s, v17.4s
      srshl v1.4s, v1.4s, v7.4s

      add v0.4s, v0.4s, v29.4s            // + output_zp
      add v1.4s, v1.4s, v29.4s
      smax v0.4s, v0.4s, v30.4s           // clamp to [acc_min, acc_max]
      smax v1.4s, v1.4s, v30.4s
      smin v0.4s, v0.4s, v31.4s
      smin v1.4s, v1.4s, v31.4s

      sqxtn v0.4h, v0.4s                  // int32 -> int16 (saturating)
      sqxtn v1.4h, v1.4s
      sqxtn v0.8b, v0.8h                  // int16 -> int8; low 4 bytes hold the results
      sqxtn v1.8b, v1.8h

      st1 {v0.s}[0], [x0], #4             // store 4 int8 results
      st1 {v1.s}[0], [x0], #4

      sub w2, w2, #8
      b LoopDepth8                        // loop head re-checks the remaining count

  LoopDepth4:
      cmp w2, #4
      blt End                             // fewer than 4 channels left

      ld1 {v0.4s}, [x1], #16              // accumulators
      ld1 {v2.4s}, [x5], #16              // left_shift
      sqshl v0.4s, v0.4s, v2.4s
      ld1 {v4.4s}, [x4], #16              // out_multiplier
      sqrdmulh v0.4s, v0.4s, v4.4s
      ld1 {v6.4s}, [x6], #16              // right_shift

      and v16.16b, v6.16b, v0.16b         // same rounding fix-up as in the 8-wide loop
      sshr v16.4s, v16.4s, #31
      sqadd v0.4s, v0.4s, v16.4s
      srshl v0.4s, v0.4s, v6.4s

      add v0.4s, v0.4s, v29.4s
      smax v0.4s, v0.4s, v30.4s
      smin v0.4s, v0.4s, v31.4s

      sqxtn v0.4h, v0.4s
      sqxtn v0.8b, v0.8h
      st1 {v0.s}[0], [x0], #4

      sub w2, w2, #4
      // Plain `sub` does not update the flags, so branch unconditionally and let
      // the compare at LoopDepth4 decide whether another group remains.
      b LoopDepth4

  End:
      ret
  #endif