You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

MatmulOptR4Int8.S 3.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  #ifdef __aarch64__
      .text
      .align 5
      .global MatMulOptR4Int8Neon64
  #ifndef __APPLE__
      .type MatMulOptR4Int8Neon64, %function
  #endif

  //-----------------------------------------------------------------------------
  // void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst,
  //                            int row4, int col4, int deep16,
  //                            const int *input_sum, const int *bias)
  //
  // Int8 matrix multiply producing int32 results in 4x4 tiles using SDOT
  // (requires the Armv8.2-A dot-product extension).
  //
  // ABI: AAPCS64, leaf function.
  // In:
  //   x0: a        left matrix; each 4-row panel is 4 * deep16 bytes, consumed
  //                as 64 bytes (4 rows x 16 depth values) per depth step
  //   x1: b        right matrix; each 4-column panel is 4 * deep16 bytes,
  //                consumed as 64 bytes (4 columns x 16 depth values) per step
  //   x2: dst      output; one 64-byte 4x4 int32 tile (row after row) is
  //                written per (column block, row block), row blocks innermost
  //   w3: row4     row count, stepped by 4
  //   w4: col4     column count, stepped by 4
  //   w5: deep16   depth, stepped by 16
  //   x6: a_sums   one int32 per row, subtracted from every result in that row
  //   x7: bias     one int32 per column, added to every result in that column
  //
  // row4/col4 are expected to be multiples of 4 and deep16 a multiple of 16.
  // The loop exits use signed comparisons so that a non-positive or
  // non-multiple count still terminates; a partial block is processed as a
  // full one, so the buffers must be padded accordingly.
  //
  // Clobbers: x1, x2, x7, x9-x13, x15-x17, v0-v7, v16-v31, flags.
  // Only caller-saved registers are touched, so nothing is spilled to the
  // stack. x18 (the platform register) is deliberately left alone.
  // NOTE(review): Mach-O C symbols carry a leading underscore - confirm how
  // this symbol is exported for Apple builds.
  //-----------------------------------------------------------------------------
  MatMulOptR4Int8Neon64:
      lsl     w12, w5, #2                 // x12 = 4 * deep16: bytes per a/b panel
      mov     w15, #0                     // b column index

  LoopCol:
      cmp     w15, w4
      bge     LoopColEnd
      mov     w16, #0                     // restart at the first row block
      mov     x17, x0                     // a ptr back to the first panel
      mov     x13, x6                     // a_sums ptr back to row 0

  LoopRow:
      cmp     w16, w3
      bge     LoopRowEnd
      mov     x9, x1                      // b ptr: start of current column panel
      mov     x10, x7                     // bias ptr: current column block
      mov     w11, w5                     // remaining depth

      // v16..v31 = 16 partial-sum accumulators, one per (row, column) pair;
      // register 16 + 4 * row + column.
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v18.4s, #0
      movi    v19.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
      movi    v22.4s, #0
      movi    v23.4s, #0
      movi    v24.4s, #0
      movi    v25.4s, #0
      movi    v26.4s, #0
      movi    v27.4s, #0
      movi    v28.4s, #0
      movi    v29.4s, #0
      movi    v30.4s, #0
      movi    v31.4s, #0

      cmp     w11, #0
      ble     LoopDepthEnd

  LoopDepth:
      // 16 depth values for 4 rows of a (v0-v3) and 4 columns of b (v4-v7).
      ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x17], #64
      ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x9], #64

      sdot    v16.4s, v4.16b, v0.16b      // row 0 x columns 0..3
      sdot    v17.4s, v5.16b, v0.16b
      sdot    v18.4s, v6.16b, v0.16b
      sdot    v19.4s, v7.16b, v0.16b
      sdot    v20.4s, v4.16b, v1.16b      // row 1 x columns 0..3
      sdot    v21.4s, v5.16b, v1.16b
      sdot    v22.4s, v6.16b, v1.16b
      sdot    v23.4s, v7.16b, v1.16b
      sdot    v24.4s, v4.16b, v2.16b      // row 2 x columns 0..3
      sdot    v25.4s, v5.16b, v2.16b
      sdot    v26.4s, v6.16b, v2.16b
      sdot    v27.4s, v7.16b, v2.16b
      sdot    v28.4s, v4.16b, v3.16b      // row 3 x columns 0..3
      sdot    v29.4s, v5.16b, v3.16b
      sdot    v30.4s, v6.16b, v3.16b
      sdot    v31.4s, v7.16b, v3.16b

      subs    w11, w11, #16               // depth - 16
      bgt     LoopDepth

  LoopDepthEnd:
      // Two rounds of pairwise adds collapse the four lanes of every
      // accumulator, leaving v16..v19 = rows 0..3 with one lane per column.
      addp    v16.4s, v16.4s, v17.4s
      addp    v18.4s, v18.4s, v19.4s
      addp    v20.4s, v20.4s, v21.4s
      addp    v22.4s, v22.4s, v23.4s
      addp    v24.4s, v24.4s, v25.4s
      addp    v26.4s, v26.4s, v27.4s
      addp    v28.4s, v28.4s, v29.4s
      addp    v30.4s, v30.4s, v31.4s

      addp    v16.4s, v16.4s, v18.4s
      addp    v17.4s, v20.4s, v22.4s
      addp    v18.4s, v24.4s, v26.4s
      addp    v19.4s, v28.4s, v30.4s

      // Add (Bias+Depth*Za*Zb-Za*Bsums): the per-column term passed in `bias`.
      // v24 is dead after the reduction, so no callee-saved register is needed.
      ld1     {v24.4s}, [x10], #16
      add     v16.4s, v16.4s, v24.4s
      add     v17.4s, v17.4s, v24.4s
      add     v18.4s, v18.4s, v24.4s
      add     v19.4s, v19.4s, v24.4s

      // Subtract (Asums*Zb): the per-row term passed in `input_sum`.
      ld1     {v25.4s}, [x13], #16
      dup     v20.4s, v25.s[0]
      dup     v21.4s, v25.s[1]
      dup     v22.4s, v25.s[2]
      dup     v23.4s, v25.s[3]
      sub     v16.4s, v16.4s, v20.4s
      sub     v17.4s, v17.4s, v21.4s
      sub     v18.4s, v18.4s, v22.4s
      sub     v19.4s, v19.4s, v23.4s

      st1     {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64

      add     w16, w16, #4                // a row index + 4
      b       LoopRow

  LoopRowEnd:
      add     w15, w15, #4                // b column index + 4
      add     x1, x1, x12                 // b ptr + one panel
      add     x7, x7, #16                 // bias ptr + 4 int32
      b       LoopCol

  LoopColEnd:
      ret
  #endif