common.h
#pragma once
#include <cmath>
#include <cstdint>
#include <type_traits>
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
namespace megdnn {
namespace aarch64 {
/* ======================== Prefetch ======================== */
#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
static inline void prefetch_6x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 ASM_PREFETCH("[%[pfp], #256]")
                 ASM_PREFETCH("[%[pfp], #320]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}
static inline void prefetch_5x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 ASM_PREFETCH("[%[pfp], #256]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}
static inline void prefetch_4x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 ASM_PREFETCH("[%[pfp], #192]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}
static inline void prefetch_3x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 ASM_PREFETCH("[%[pfp], #128]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}
static inline void prefetch_2x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]")
                 ASM_PREFETCH("[%[pfp], #64]")
                 :
                 : [pfp] "r"(pfp)
                 : "memory");
    // clang-format on
}
static inline void prefetch_1x(const void* pfp) {
    // clang-format off
    asm volatile(ASM_PREFETCH("[%[pfp]]") : : [pfp] "r"(pfp) : "memory");
    // clang-format on
}
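/*
 * Illustrative sketch (not part of the original header): how the helpers
 * above are typically used. prefetch_2x(p) issues PRFM for p and p + 64,
 * i.e. it warms two consecutive 64-byte cache lines. The packing loop and
 * its name below are hypothetical.
 */
static inline void example_prefetch_pack(const int16_t* src, int16_t* dst, int blocks) {
    for (int b = 0; b < blocks; ++b) {
        // Warm the two cache lines the *next* iteration will read:
        // 64 int16 = 128 bytes ahead of the current position.
        prefetch_2x(src + 64);
        for (int i = 0; i < 64; ++i) {
            dst[i] = src[i];
        }
        src += 64;
        dst += 64;
    }
}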
/* ======================== interleave pack A ======================== */
/**
 * interleave_INTERLEAVE_UNROLLK_BATCH_type
 *
 * BATCH means each call processes BATCH * UNROLL_K columns at once, where
 * BATCH * sizeof(TYPE) * UNROLL_K = 16 bytes (128 bits, one vector register).
 *
 * The elements are traversed in this order:
 * rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *outptr++ = inptr[j, i]
 */
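/*
 * Illustrative sketch (not part of the original header): a scalar reference
 * for the layout documented above, with the BATCH loop made explicit. The
 * NEON asm variants below produce the same output, one
 * INTERLEAVE x (BATCH * UNROLL_K) tile per call.
 */
template <typename T>
static inline void example_interleave_reference(
        const T* const* inptr, T*& outptr, int interleave, int unroll_k, int batch) {
    for (int b = 0; b < batch; ++b) {
        for (int j = 0; j < interleave; ++j) {
            for (int i = 0; i < unroll_k; ++i) {
                // e.g. interleave=8, unroll_k=1, batch=8 (interleave_8x1_8_h)
                // emits A0 B0 ... H0, then A1 B1 ... H1, and so on.
                *outptr++ = inptr[j][b * unroll_k + i];
            }
        }
    }
}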
template <typename T>
static inline void interleave_24x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n" // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n" // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"
            "zip1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"
            "ldr q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
            "ldr q7, [%[inptr7]], #16\n"
            "zip1 v10.8h, v1.8h, v5.8h\n" // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n" // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n" // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"
            "zip1 v13.8h, v10.8h, v11.8h\n" // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"
            "zip2 v30.8h, v16.8h, v17.8h\n"
            "zip1 v23.8h, v18.8h, v19.8h\n"
            "zip2 v31.8h, v18.8h, v19.8h\n"
            "zip1 v14.8h, v12.8h, v13.8h\n" // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n" // q15=A1B1C1D1E1F1G1H1
            "str q14, [%[outptr]], #48\n"
            "str q15, [%[outptr]], #48\n"
            "zip1 v0.8h, v20.8h, v21.8h\n"
            "zip2 v1.8h, v20.8h, v21.8h\n"
            "str q0, [%[outptr]], #48\n"
            "str q1, [%[outptr]], #48\n"
            "zip1 v2.8h, v22.8h, v23.8h\n"
            "zip2 v3.8h, v22.8h, v23.8h\n"
            "str q2, [%[outptr]], #48\n"
            "str q3, [%[outptr]], #48\n"
            "zip1 v4.8h, v30.8h, v31.8h\n"
            "zip2 v5.8h, v30.8h, v31.8h\n"
            "str q4, [%[outptr]], #48\n"
            "str q5, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_16x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n" // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n" // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"
            "zip1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"
            "ldr q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
            "ldr q7, [%[inptr7]], #16\n"
            "zip1 v10.8h, v1.8h, v5.8h\n" // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n" // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n" // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"
            "zip1 v13.8h, v10.8h, v11.8h\n" // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"
            "zip2 v30.8h, v16.8h, v17.8h\n"
            "zip1 v23.8h, v18.8h, v19.8h\n"
            "zip2 v31.8h, v18.8h, v19.8h\n"
            "zip1 v14.8h, v12.8h, v13.8h\n" // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n" // q15=A1B1C1D1E1F1G1H1
            "str q14, [%[outptr]], #32\n"
            "str q15, [%[outptr]], #32\n"
            "zip1 v0.8h, v20.8h, v21.8h\n"
            "zip2 v1.8h, v20.8h, v21.8h\n"
            "str q0, [%[outptr]], #32\n"
            "str q1, [%[outptr]], #32\n"
            "zip1 v2.8h, v22.8h, v23.8h\n"
            "zip2 v3.8h, v22.8h, v23.8h\n"
            "str q2, [%[outptr]], #32\n"
            "str q3, [%[outptr]], #32\n"
            "zip1 v4.8h, v30.8h, v31.8h\n"
            "zip2 v5.8h, v30.8h, v31.8h\n"
            "str q4, [%[outptr]], #32\n"
            "str q5, [%[outptr]], #32\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_8x1_8_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n" // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n" // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"
            "zip1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"
            "ldr q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
            "ldr q7, [%[inptr7]], #16\n"
            "zip1 v10.8h, v1.8h, v5.8h\n" // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n" // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n" // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"
            "zip1 v13.8h, v10.8h, v11.8h\n" // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"
            "zip2 v30.8h, v16.8h, v17.8h\n"
            "zip1 v23.8h, v18.8h, v19.8h\n"
            "zip2 v31.8h, v18.8h, v19.8h\n"
            "zip1 v14.8h, v12.8h, v13.8h\n" // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n" // q15=A1B1C1D1E1F1G1H1
            "stp q14, q15, [%[outptr]], #32\n" // Write back first two elements
            "zip1 v0.8h, v20.8h, v21.8h\n"
            "zip2 v1.8h, v20.8h, v21.8h\n"
            "stp q0, q1, [%[outptr]], #32\n" // Write back next two elements
            "zip1 v2.8h, v22.8h, v23.8h\n"
            "zip2 v3.8h, v22.8h, v23.8h\n"
            "stp q2, q3, [%[outptr]], #32\n" // Write back next two elements
            "zip1 v4.8h, v30.8h, v31.8h\n"
            "zip2 v5.8h, v30.8h, v31.8h\n"
            "stp q4, q5, [%[outptr]], #32\n" // Write back last two elements
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_4x1_4_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr d0, [%[inptr0]], #8\n" // d0 = A0A1A2A3
            "ldr d1, [%[inptr1]], #8\n" // d1 = B0B1B2B3
            "ldr d2, [%[inptr2]], #8\n" // d2 = C0C1C2C3
            "ldr d3, [%[inptr3]], #8\n" // d3 = D0D1D2D3
            "zip1 v4.4h, v0.4h, v2.4h\n" // d4 = A0C0A1C1
            "zip2 v8.4h, v0.4h, v2.4h\n" // d8 = A2C2A3C3
            "zip1 v5.4h, v1.4h, v3.4h\n" // d5 = B0D0B1D1
            "zip2 v9.4h, v1.4h, v3.4h\n" // d9 = B2D2B3D3
            "zip1 v6.4h, v4.4h, v5.4h\n" // d6 = A0B0C0D0
            "zip2 v7.4h, v4.4h, v5.4h\n" // d7 = A1B1C1D1
            "stp d6, d7, [%[outptr]], #16\n"
            "zip1 v10.4h, v8.4h, v9.4h\n" // d10 = A2B2C2D2
            "zip2 v11.4h, v8.4h, v9.4h\n" // d11 = A3B3C3D3
            "stp d10, d11, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}
static inline void interleave_4x1_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n" // d0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n" // d1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n" // d2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n" // d3 = D0D1
            "zip1 v4.2d, v0.2d, v1.2d\n" // d4 = A0B0
            "zip2 v5.2d, v0.2d, v1.2d\n" // d5 = A1B1
            "zip1 v6.2d, v2.2d, v3.2d\n" // d6 = C0D0
            "zip2 v7.2d, v2.2d, v3.2d\n" // d7 = C1D1
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}
static inline void interleave_4x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n" // d0 = A0
            "ld1 {v1.2d}, [%[inptr0]], #16\n" // d1 = A1
            "ld1 {v2.2d}, [%[inptr1]], #16\n" // d2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n" // d3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n" // d4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n" // d5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n" // d6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n" // d7 = D1
            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}
static inline void interleave_12x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, const int32_t*& inptr8,
        const int32_t*& inptr9, const int32_t*& inptr10, const int32_t*& inptr11,
        int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n" // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n" // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n" // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n" // d3 = D0D1D2D3
            "zip1 v12.4s, v0.4s, v2.4s\n" // d12 = A0C0A1C1
            "zip2 v13.4s, v0.4s, v2.4s\n" // d13 = A2C2A3C3
            "zip1 v14.4s, v1.4s, v3.4s\n" // d14 = B0D0B1D1
            "zip2 v15.4s, v1.4s, v3.4s\n" // d15 = B2D2B3D3
            "zip1 v0.4s, v12.4s, v14.4s\n" // d0 = A0B0C0D0
            "zip2 v1.4s, v12.4s, v14.4s\n" // d1 = A1B1C1D1
            "zip1 v2.4s, v13.4s, v15.4s\n" // d2 = A2B2C2D2
            "zip2 v3.4s, v13.4s, v15.4s\n" // d3 = A3B3C3D3
            "ld1 {v4.4s}, [%[inptr4]], #16\n" // d4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n" // d5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n" // d6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n" // d7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n" // d16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n" // d17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n" // d18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n" // d19 = F2H2F3H3
            "zip1 v4.4s, v16.4s, v18.4s\n" // d4 = E0F0G0H0
            "zip2 v5.4s, v16.4s, v18.4s\n" // d5 = E1F1G1H1
            "zip1 v6.4s, v17.4s, v19.4s\n" // d6 = E2F2G2H2
            "zip2 v7.4s, v17.4s, v19.4s\n" // d7 = E3F3G3H3
            "ld1 {v8.4s}, [%[inptr8]], #16\n" // d8 = I0I1I2I3
            "ld1 {v9.4s}, [%[inptr9]], #16\n" // d9 = J0J1J2J3
            "ld1 {v10.4s}, [%[inptr10]], #16\n" // d10 = K0K1K2K3
            "ld1 {v11.4s}, [%[inptr11]], #16\n" // d11 = L0L1L2L3
            "zip1 v20.4s, v8.4s, v10.4s\n" // d20 = I0K0I1K1
            "zip2 v21.4s, v8.4s, v10.4s\n" // d21 = I2K2I3K3
            "zip1 v22.4s, v9.4s, v11.4s\n" // d22 = J0L0J1L1
            "zip2 v23.4s, v9.4s, v11.4s\n" // d23 = J2L2J3L3
            "zip1 v8.4s, v20.4s, v22.4s\n" // d8 = I0J0K0L0
            "zip2 v9.4s, v20.4s, v22.4s\n" // d9 = I1J1K1L1
            "zip1 v10.4s, v21.4s, v23.4s\n" // d10 = I2J2K2L2
            "zip2 v11.4s, v21.4s, v23.4s\n" // d11 = I3J3K3L3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v4.4s}, [%[outptr]], #16\n"
            "st1 {v8.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v5.4s}, [%[outptr]], #16\n"
            "st1 {v9.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v6.4s}, [%[outptr]], #16\n"
            "st1 {v10.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"
            "st1 {v7.4s}, [%[outptr]], #16\n"
            "st1 {v11.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
template <typename T>
static inline void interleave_12x1_4_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, const T*& in4,
        const T*& in5, const T*& in6, const T*& in7, const T*& in8, const T*& in9,
        const T*& in10, const T*& in11, T*& out) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_12x1_4_h only support uint16_t and int16_t");
    const int16_t*& inptr0 = reinterpret_cast<const int16_t*&>(in0);
    const int16_t*& inptr1 = reinterpret_cast<const int16_t*&>(in1);
    const int16_t*& inptr2 = reinterpret_cast<const int16_t*&>(in2);
    const int16_t*& inptr3 = reinterpret_cast<const int16_t*&>(in3);
    const int16_t*& inptr4 = reinterpret_cast<const int16_t*&>(in4);
    const int16_t*& inptr5 = reinterpret_cast<const int16_t*&>(in5);
    const int16_t*& inptr6 = reinterpret_cast<const int16_t*&>(in6);
    const int16_t*& inptr7 = reinterpret_cast<const int16_t*&>(in7);
    const int16_t*& inptr8 = reinterpret_cast<const int16_t*&>(in8);
    const int16_t*& inptr9 = reinterpret_cast<const int16_t*&>(in9);
    const int16_t*& inptr10 = reinterpret_cast<const int16_t*&>(in10);
    const int16_t*& inptr11 = reinterpret_cast<const int16_t*&>(in11);
    int16_t*& outptr = reinterpret_cast<int16_t*&>(out);
    asm volatile(
            "ld1 {v0.4h}, [%[inptr0]], #8\n" // d0 = A0A1A2A3
            "ld1 {v1.4h}, [%[inptr1]], #8\n" // d1 = B0B1B2B3
            "ld1 {v2.4h}, [%[inptr2]], #8\n" // d2 = C0C1C2C3
            "ld1 {v3.4h}, [%[inptr3]], #8\n" // d3 = D0D1D2D3
            "zip1 v12.4h, v0.4h, v2.4h\n" // d12 = A0C0A1C1
            "zip2 v13.4h, v0.4h, v2.4h\n" // d13 = A2C2A3C3
            "zip1 v14.4h, v1.4h, v3.4h\n" // d14 = B0D0B1D1
            "zip2 v15.4h, v1.4h, v3.4h\n" // d15 = B2D2B3D3
            "zip1 v0.4h, v12.4h, v14.4h\n" // d0 = A0B0C0D0
            "zip2 v1.4h, v12.4h, v14.4h\n" // d1 = A1B1C1D1
            "zip1 v2.4h, v13.4h, v15.4h\n" // d2 = A2B2C2D2
            "zip2 v3.4h, v13.4h, v15.4h\n" // d3 = A3B3C3D3
            "ld1 {v4.4h}, [%[inptr4]], #8\n" // d4 = E0E1E2E3
            "ld1 {v5.4h}, [%[inptr5]], #8\n" // d5 = F0F1F2F3
            "ld1 {v6.4h}, [%[inptr6]], #8\n" // d6 = G0G1G2G3
            "ld1 {v7.4h}, [%[inptr7]], #8\n" // d7 = H0H1H2H3
            "zip1 v16.4h, v4.4h, v6.4h\n" // d16 = E0G0E1G1
            "zip2 v17.4h, v4.4h, v6.4h\n" // d17 = E2G2E3G3
            "zip1 v18.4h, v5.4h, v7.4h\n" // d18 = F0H0F1H1
            "zip2 v19.4h, v5.4h, v7.4h\n" // d19 = F2H2F3H3
            "zip1 v4.4h, v16.4h, v18.4h\n" // d4 = E0F0G0H0
            "zip2 v5.4h, v16.4h, v18.4h\n" // d5 = E1F1G1H1
            "zip1 v6.4h, v17.4h, v19.4h\n" // d6 = E2F2G2H2
            "zip2 v7.4h, v17.4h, v19.4h\n" // d7 = E3F3G3H3
            "ld1 {v8.4h}, [%[inptr8]], #8\n" // d8 = I0I1I2I3
            "ld1 {v9.4h}, [%[inptr9]], #8\n" // d9 = J0J1J2J3
            "ld1 {v10.4h}, [%[inptr10]], #8\n" // d10 = K0K1K2K3
            "ld1 {v11.4h}, [%[inptr11]], #8\n" // d11 = L0L1L2L3
            "zip1 v20.4h, v8.4h, v10.4h\n" // d20 = I0K0I1K1
            "zip2 v21.4h, v8.4h, v10.4h\n" // d21 = I2K2I3K3
            "zip1 v22.4h, v9.4h, v11.4h\n" // d22 = J0L0J1L1
            "zip2 v23.4h, v9.4h, v11.4h\n" // d23 = J2L2J3L3
            "zip1 v8.4h, v20.4h, v22.4h\n" // d8 = I0J0K0L0
            "zip2 v9.4h, v20.4h, v22.4h\n" // d9 = I1J1K1L1
            "zip1 v10.4h, v21.4h, v23.4h\n" // d10 = I2J2K2L2
            "zip2 v11.4h, v21.4h, v23.4h\n" // d11 = I3J3K3L3
            "st1 {v0.4h}, [%[outptr]], #8\n" // d0 = A0B0C0D0
            "st1 {v4.4h}, [%[outptr]], #8\n" // d4 = E0F0G0H0
            "st1 {v8.4h}, [%[outptr]], #8\n" // d8 = I0J0K0L0
            "st1 {v1.4h}, [%[outptr]], #8\n" // d1 = A1B1C1D1
            "st1 {v5.4h}, [%[outptr]], #8\n" // d5 = E1F1G1H1
            "st1 {v9.4h}, [%[outptr]], #8\n" // d9 = I1J1K1L1
            "st1 {v2.4h}, [%[outptr]], #8\n" // d2 = A2B2C2D2
            "st1 {v6.4h}, [%[outptr]], #8\n" // d6 = E2F2G2H2
            "st1 {v10.4h}, [%[outptr]], #8\n" // d10 = I2J2K2L2
            "st1 {v3.4h}, [%[outptr]], #8\n" // d3 = A3B3C3D3
            "st1 {v7.4h}, [%[outptr]], #8\n" // d7 = E3F3G3H3
            "st1 {v11.4h}, [%[outptr]], #8\n" // d11 = I3J3K3L3
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
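/*
 * Note added for clarity: the byte/halfword wrappers below (e.g.
 * interleave_12x4_4_b) reuse the wider kernels above by reinterpreting each
 * group of UNROLL_K small elements as one larger element: four consecutive
 * (u)int8 values are treated as a single int32, so interleaving 12x4 bytes
 * is exactly interleave_12x1_4_s on retyped pointers. This works because
 * each 4-byte group stays contiguous in both the source rows and the
 * interleaved output.
 */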
template <typename T>
static inline void interleave_12x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_12x4_4_b only support uint8_t and int8_t");
    interleave_12x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<const int32_t*&>(inptr4),
            reinterpret_cast<const int32_t*&>(inptr5),
            reinterpret_cast<const int32_t*&>(inptr6),
            reinterpret_cast<const int32_t*&>(inptr7),
            reinterpret_cast<const int32_t*&>(inptr8),
            reinterpret_cast<const int32_t*&>(inptr9),
            reinterpret_cast<const int32_t*&>(inptr10),
            reinterpret_cast<const int32_t*&>(inptr11),
            reinterpret_cast<int32_t*&>(outptr));
}
static inline void interleave_2x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n" // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n" // d1 = B0B1B2B3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "cc", "memory");
}
static inline void interleave_8x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n" // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n" // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n" // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n" // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n" // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n" // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n" // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n" // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n" // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n" // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n" // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n" // d15 = A3B3C3D3
            "ld1 {v4.4s}, [%[inptr4]], #16\n" // d4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n" // d5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n" // d6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n" // d7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n" // d16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n" // d17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n" // d18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n" // d19 = F2H2F3H3
            "zip1 v20.4s, v16.4s, v18.4s\n" // d20 = E0F0G0H0
            "zip2 v21.4s, v16.4s, v18.4s\n" // d21 = E1F1G1H1
            "zip1 v22.4s, v17.4s, v19.4s\n" // d22 = E2F2G2H2
            "zip2 v23.4s, v17.4s, v19.4s\n" // d23 = E3F3G3H3
            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v20.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v21.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v22.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"
            "st1 {v23.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
static inline void interleave_8x1_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n" // d0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n" // d1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n" // d2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n" // d3 = D0D1
            "ld1 {v4.2d}, [%[inptr4]], #16\n" // d4 = E0E1
            "ld1 {v5.2d}, [%[inptr5]], #16\n" // d5 = F0F1
            "ld1 {v6.2d}, [%[inptr6]], #16\n" // d6 = G0G1
            "ld1 {v7.2d}, [%[inptr7]], #16\n" // d7 = H0H1
            "zip1 v8.2d, v0.2d, v1.2d\n" // d8 = A0B0
            "zip2 v9.2d, v0.2d, v1.2d\n" // d9 = A1B1
            "zip1 v10.2d, v2.2d, v3.2d\n" // d10 = C0D0
            "zip2 v11.2d, v2.2d, v3.2d\n" // d11 = C1D1
            "zip1 v12.2d, v4.2d, v5.2d\n" // d12 = E0F0
            "zip2 v13.2d, v4.2d, v5.2d\n" // d13 = E1F1
            "zip1 v14.2d, v6.2d, v7.2d\n" // d14 = G0H0
            "zip2 v15.2d, v6.2d, v7.2d\n" // d15 = G1H1
            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
static inline void interleave_8x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n" // d0 = A0
            "ld1 {v1.2d}, [%[inptr0]], #16\n" // d1 = A1
            "ld1 {v2.2d}, [%[inptr1]], #16\n" // d2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n" // d3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n" // d4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n" // d5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n" // d6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n" // d7 = D1
            "ld1 {v8.2d}, [%[inptr4]], #16\n" // d8 = E0
            "ld1 {v9.2d}, [%[inptr4]], #16\n" // d9 = E1
            "ld1 {v10.2d}, [%[inptr5]], #16\n" // d10 = F0
            "ld1 {v11.2d}, [%[inptr5]], #16\n" // d11 = F1
            "ld1 {v12.2d}, [%[inptr6]], #16\n" // d12 = G0
            "ld1 {v13.2d}, [%[inptr6]], #16\n" // d13 = G1
            "ld1 {v14.2d}, [%[inptr7]], #16\n" // d14 = H0
            "ld1 {v15.2d}, [%[inptr7]], #16\n" // d15 = H1
            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
template <typename T>
static inline void interleave_2x4_4_b(const T*& inptr0, const T*& inptr1, T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_2x4_4_b only support uint8_t and int8_t");
    interleave_2x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<int32_t*&>(outptr));
}
template <typename T>
static inline void interleave_8x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x4_4_b only support uint8_t and int8_t");
    interleave_8x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<const int32_t*&>(inptr4),
            reinterpret_cast<const int32_t*&>(inptr5),
            reinterpret_cast<const int32_t*&>(inptr6),
            reinterpret_cast<const int32_t*&>(inptr7),
            reinterpret_cast<int32_t*&>(outptr));
}
template <typename T>
static inline void interleave_8x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n" // A1A2A3A4A5A6A7A8
            "ldr q1, [%[in1]], #16\n" // B1B2B3B4B5B6B7B8
            "ldr q2, [%[in2]], #16\n" // C1C2C3C4C5C6C7C8
            "ldr q3, [%[in3]], #16\n" // D1D2D3D4D5D6D7D8
            "trn1 v4.8h, v0.8h, v1.8h\n" // A1B1A3B3A5B5A7B7
            "trn2 v5.8h, v0.8h, v1.8h\n" // A2B2A4B4A6B6A8B8
            "trn1 v6.8h, v2.8h, v3.8h\n" // C1D1C3D3C5D5C7D7
            "trn2 v7.8h, v2.8h, v3.8h\n" // C2D2C4D4C6D6C8D8
            "zip1 v8.4s, v4.4s, v6.4s\n" // A1B1C1D1A3B3C3D3
            "zip2 v9.4s, v4.4s, v6.4s\n" // A5B5C5D5A7B7C7D7
            "zip1 v10.4s, v5.4s, v7.4s\n" // A2B2C2D2A4B4C4D4
            "zip2 v11.4s, v5.4s, v7.4s\n" // A6B6C6D6A8B8C8D8
            "zip1 v12.2d, v8.2d, v10.2d\n" // A1B1C1D1A2B2C2D2
            "zip2 v13.2d, v8.2d, v10.2d\n" // A3B3C3D3A4B4C4D4
            "zip1 v14.2d, v9.2d, v11.2d\n" // A5B5C5D5A6B6C6D6
            "zip2 v15.2d, v9.2d, v11.2d\n" // A7B7C7D7A8B8C8D8
            "st1 {v12.2d}, [%[out]], #16\n"
            "st1 {v13.2d}, [%[out]], #16\n"
            "st1 {v14.2d}, [%[out]], #16\n"
            "st1 {v15.2d}, [%[out]], #16\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "memory");
}
template <typename T>
static inline void interleave_8x8_2_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_2_b only support uint8_t and int8_t");
    interleave_8x1_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<const int64_t*&>(inptr4),
            reinterpret_cast<const int64_t*&>(inptr5),
            reinterpret_cast<const int64_t*&>(inptr6),
            reinterpret_cast<const int64_t*&>(inptr7),
            reinterpret_cast<int64_t*&>(outptr));
}
template <typename T>
static inline void interleave_8x8_2_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_8x8_2_h only support uint16_t and int16_t");
    interleave_8x2_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<const int64_t*&>(inptr4),
            reinterpret_cast<const int64_t*&>(inptr5),
            reinterpret_cast<const int64_t*&>(inptr6),
            reinterpret_cast<const int64_t*&>(inptr7),
            reinterpret_cast<int64_t*&>(outptr));
}
template <typename T>
static inline void interleave_8x2_8_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x2_8_b only support uint8_t and int8_t");
    interleave_8x1_8_h(
            reinterpret_cast<const int16_t*&>(inptr0),
            reinterpret_cast<const int16_t*&>(inptr1),
            reinterpret_cast<const int16_t*&>(inptr2),
            reinterpret_cast<const int16_t*&>(inptr3),
            reinterpret_cast<const int16_t*&>(inptr4),
            reinterpret_cast<const int16_t*&>(inptr5),
            reinterpret_cast<const int16_t*&>(inptr6),
            reinterpret_cast<const int16_t*&>(inptr7),
            reinterpret_cast<int16_t*&>(outptr));
}
template <typename T>
static inline void interleave_8x8_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.d}[0], [%[inptr0]], 8\n" // A1A2A3A4A5A6A7A8
            "ld1 {v0.d}[1], [%[inptr1]], 8\n" // B1B2B3B4B5B6B7B8
            "ld1 {v1.d}[0], [%[inptr2]], 8\n" // C1C2C3C4C5C6C7C8
            "ld1 {v1.d}[1], [%[inptr3]], 8\n" // D1D2D3D4D5D6D7D8
            "ld1 {v2.d}[0], [%[inptr4]], 8\n" // E1E2E3E4E5E6E7E8
            "ld1 {v2.d}[1], [%[inptr5]], 8\n" // F1F2F3F4F5F6F7F8
            "ld1 {v3.d}[0], [%[inptr6]], 8\n" // G1G2G3G4G5G6G7G8
            "ld1 {v3.d}[1], [%[inptr7]], 8\n" // H1H2H3H4H5H6H7H8
            "st1 {v0.2d}, [%[outptr]], 16\n" // A1A2A3A4A5A6A7A8B1B2B3B4B5B6B7B8
            "st1 {v1.2d}, [%[outptr]], 16\n" // C1C2C3C4C5C6C7C8D1D2D3D4D5D6D7D8
            "st1 {v2.2d}, [%[outptr]], 16\n" // E1E2E3E4E5E6E7E8F1F2F3F4F5F6F7F8
            "st1 {v3.2d}, [%[outptr]], 16\n" // G1G2G3G4G5G6G7G8H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
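/*
 * Note added for clarity: interleave_8x4_1_b_with_shift below unpacks two
 * int4 values from every byte. "shl #4" moves the low nibble into the high
 * bits, then the arithmetic "sshr #4" shifts both copies back down,
 * sign-extending each nibble to a full int8; the zips restore the original
 * low/high element order before the store.
 */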
template <typename T>
static inline void interleave_8x4_1_b_with_shift(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"
            "shl v2.16b, v0.16b, #4\n"
            "shl v5.16b, v1.16b, #4\n"
            "sshr v3.16b, v0.16b, #4\n" // high nibble, sign-extended
            "sshr v4.16b, v2.16b, #4\n" // low nibble, sign-extended
            "sshr v6.16b, v1.16b, #4\n" // high nibble, sign-extended
            "sshr v7.16b, v5.16b, #4\n" // low nibble, sign-extended
            "zip1 v8.16b, v4.16b, v3.16b\n"
            "zip2 v9.16b, v4.16b, v3.16b\n"
            "zip1 v10.16b, v7.16b, v6.16b\n"
            "zip2 v11.16b, v7.16b, v6.16b\n"
            "st1 {v8.16b-v11.16b}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}
template <typename T>
static inline void interleave_8x8_1_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_8x8_1_h only support uint16_t and int16_t");
    asm volatile(
            "ld1 {v0.8h}, [%[inptr0]], #16\n" // A1A2A3A4A5A6A7A8
            "ld1 {v1.8h}, [%[inptr1]], #16\n" // B1B2B3B4B5B6B7B8
            "ld1 {v2.8h}, [%[inptr2]], #16\n" // C1C2C3C4C5C6C7C8
            "ld1 {v3.8h}, [%[inptr3]], #16\n" // D1D2D3D4D5D6D7D8
            "ld1 {v4.8h}, [%[inptr4]], #16\n" // E1E2E3E4E5E6E7E8
            "ld1 {v5.8h}, [%[inptr5]], #16\n" // F1F2F3F4F5F6F7F8
            "ld1 {v6.8h}, [%[inptr6]], #16\n" // G1G2G3G4G5G6G7G8
            "ld1 {v7.8h}, [%[inptr7]], #16\n" // H1H2H3H4H5H6H7H8
            "st1 {v0.8h}, [%[outptr]], #16\n" // A1A2A3A4A5A6A7A8
            "st1 {v1.8h}, [%[outptr]], #16\n" // B1B2B3B4B5B6B7B8
            "st1 {v2.8h}, [%[outptr]], #16\n" // C1C2C3C4C5C6C7C8
            "st1 {v3.8h}, [%[outptr]], #16\n" // D1D2D3D4D5D6D7D8
            "st1 {v4.8h}, [%[outptr]], #16\n" // E1E2E3E4E5E6E7E8
            "st1 {v5.8h}, [%[outptr]], #16\n" // F1F2F3F4F5F6F7F8
            "st1 {v6.8h}, [%[outptr]], #16\n" // G1G2G3G4G5G6G7G8
            "st1 {v7.8h}, [%[outptr]], #16\n" // H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
static inline void interleave_4x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n" // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n" // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n" // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n" // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n" // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n" // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n" // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n" // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n" // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n" // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n" // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n" // d15 = A3B3C3D3
            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
template <typename T>
static inline void interleave_4x8_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
            "ld1 {v2.4s, v3.4s}, [%[inptr1]], #32\n"
            "ld1 {v4.4s, v5.4s}, [%[inptr2]], #32\n"
            "ld1 {v6.4s, v7.4s}, [%[inptr3]], #32\n"
            "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"
            "st1 {v2.4s, v3.4s}, [%[outptr]], #32\n"
            "st1 {v4.4s, v5.4s}, [%[outptr]], #32\n"
            "st1 {v6.4s, v7.4s}, [%[outptr]], #32\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}
template <typename T>
static inline void interleave_4x12_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
            "ld1 {v4.4s, v5.4s, v6.4s}, [%[inptr1]], #48\n"
            "ld1 {v8.4s, v9.4s, v10.4s}, [%[inptr2]], #48\n"
            "ld1 {v12.4s, v13.4s, v14.4s}, [%[inptr3]], #48\n"
            "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"
            "st1 {v4.4s, v5.4s, v6.4s}, [%[outptr]], #48\n"
            "st1 {v8.4s, v9.4s, v10.4s}, [%[outptr]], #48\n"
            "st1 {v12.4s, v13.4s, v14.4s}, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v4", "v5", "v6", "v8", "v9", "v10", "v12", "v13",
              "v14", "cc", "memory");
}
template <typename T>
static inline void interleave_4x16_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n" // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n" // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n" // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n" // d3 = D0D1D2D3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "cc", "memory");
}
template <typename T>
static inline void interleave_4x16_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[inptr2]], #64\n"
            "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[inptr3]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[outptr]], #64\n"
            "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[outptr]], #64\n"
            "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
template <typename T>
static inline void interleave_4x2_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x2_4_b only support uint8_t and int8_t");
    interleave_4x1_4_h(
            reinterpret_cast<const int16_t*&>(inptr0),
            reinterpret_cast<const int16_t*&>(inptr1),
            reinterpret_cast<const int16_t*&>(inptr2),
            reinterpret_cast<const int16_t*&>(inptr3),
            reinterpret_cast<int16_t*&>(outptr));
}
template <typename T>
static inline void interleave_4x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x4_4_b only support uint8_t and int8_t");
    interleave_4x1_4_s(
            reinterpret_cast<const int32_t*&>(inptr0),
            reinterpret_cast<const int32_t*&>(inptr1),
            reinterpret_cast<const int32_t*&>(inptr2),
            reinterpret_cast<const int32_t*&>(inptr3),
            reinterpret_cast<int32_t*&>(outptr));
}
template <typename T>
static inline void interleave_4x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_4x4_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "ld1 {v1.4s}, [%[inptr1]], #16\n"
            "ld1 {v2.4s}, [%[inptr2]], #16\n"
            "ld1 {v3.4s}, [%[inptr3]], #16\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}
template <typename T>
static inline void interleave_2x4_4_s(const T*& inptr0, const T*& inptr1, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_2x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "stp q0, q4, [%[outptr]]\n"
            "stp q1, q5, [%[outptr], #32]\n"
            "stp q2, q6, [%[outptr], #64]\n"
            "stp q3, q7, [%[outptr], #96]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
template <typename T>
static inline void interleave_1x4_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
template <typename T>
static inline void interleave_4x8_2_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_4x8_2_b only support uint8_t and int8_t");
    interleave_4x1_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<int64_t*&>(outptr));
}
template <typename T>
static inline void interleave_4x8_2_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_4x8_2_h only support uint16_t and int16_t");
    interleave_4x2_2_d(
            reinterpret_cast<const int64_t*&>(inptr0),
            reinterpret_cast<const int64_t*&>(inptr1),
            reinterpret_cast<const int64_t*&>(inptr2),
            reinterpret_cast<const int64_t*&>(inptr3),
            reinterpret_cast<int64_t*&>(outptr));
}
template <typename T>
static inline void interleave_1x16_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x16_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}
template <typename T>
static inline void interleave_1x12_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x12_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
  1094. "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"
  1095. : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
  1096. :
  1097. : "v0", "v1", "v2", "cc", "memory");
  1098. }
  1099. template <typename T>
  1100. static inline void interleave_1x8_1_s(const T*& inptr0, T*& outptr) {
  1101. static_assert(sizeof(T) == 4, "interleave_1x8_1_s only support size == 4");
  1102. asm volatile(
  1103. "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
  1104. "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"
  1105. : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
  1106. :
  1107. : "v0", "v1", "cc", "memory");
  1108. }
  1109. template <typename T>
  1110. static inline void interleave_1x4_1_s(const T*& inptr0, T*& outptr) {
  1111. static_assert(sizeof(T) == 4, "interleave_1x4_1_s only support size == 4");
  1112. asm volatile(
  1113. "ld1 {v0.4s}, [%[inptr0]], #16\n"
  1114. "st1 {v0.4s}, [%[outptr]], #16\n"
  1115. : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
  1116. :
  1117. : "v0", "cc", "memory");
  1118. }
  1119. template <typename T>
  1120. static inline void interleave_helper(
  1121. const T*& inptr, T*& outptr, int unroll_k, int ksize, T val = 0) {
  1122. int k = 0;
  1123. for (; k < ksize; k++) {
  1124. *outptr++ = *inptr++;
  1125. }
  1126. for (; k < unroll_k; k++) {
  1127. *outptr++ = val;
  1128. }
  1129. }
  1130. template <typename T>
  1131. static inline void interleave_1(
  1132. const T*& inptr0, T*& outptr, int unroll_k, int ksize, T val = 0) {
  1133. for (int k = 0; k < ksize; k += unroll_k) {
  1134. int size = std::min(unroll_k, ksize - k);
  1135. interleave_helper(inptr0, outptr, unroll_k, size, val);
  1136. }
  1137. }
  1138. template <typename T>
  1139. static inline void interleave_4(
  1140. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1141. T*& outptr, int unroll_k, int ksize, T val = 0) {
  1142. for (int k = 0; k < ksize; k += unroll_k) {
  1143. int size = std::min(unroll_k, ksize - k);
  1144. interleave_helper(inptr0, outptr, unroll_k, size, val);
  1145. interleave_helper(inptr1, outptr, unroll_k, size, val);
  1146. interleave_helper(inptr2, outptr, unroll_k, size, val);
  1147. interleave_helper(inptr3, outptr, unroll_k, size, val);
  1148. }
  1149. }
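//! Usage sketch for the interleave_1/4/8/12 helpers (exposition only; the
//! names src/ld_src/K/packed are illustrative assumptions, not taken from a
//! real kernel): pack a 4 x K block into 16-wide panels, zero-padding the
//! K tail. Each k-step emits 4 * unroll_k values: 16 from row 0, then rows
//! 1, 2 and 3.
template <typename T>
static inline void interleave_4_usage_sketch(
        const T* src, int ld_src, int K, T* packed) {
    const T* r0 = src + 0 * ld_src;
    const T* r1 = src + 1 * ld_src;
    const T* r2 = src + 2 * ld_src;
    const T* r3 = src + 3 * ld_src;
    interleave_4(r0, r1, r2, r3, packed, 16, K, static_cast<T>(0));
}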
template <typename T>
static inline void interleave_8(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int unroll_k, int ksize, T val = 0) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size, val);
        interleave_helper(inptr1, outptr, unroll_k, size, val);
        interleave_helper(inptr2, outptr, unroll_k, size, val);
        interleave_helper(inptr3, outptr, unroll_k, size, val);
        interleave_helper(inptr4, outptr, unroll_k, size, val);
        interleave_helper(inptr5, outptr, unroll_k, size, val);
        interleave_helper(inptr6, outptr, unroll_k, size, val);
        interleave_helper(inptr7, outptr, unroll_k, size, val);
    }
}
template <typename T>
static inline void interleave_12(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T*& outptr, int unroll_k, int ksize) {
    for (int k = 0; k < ksize; k += unroll_k) {
        int size = std::min(unroll_k, ksize - k);
        interleave_helper(inptr0, outptr, unroll_k, size);
        interleave_helper(inptr1, outptr, unroll_k, size);
        interleave_helper(inptr2, outptr, unroll_k, size);
        interleave_helper(inptr3, outptr, unroll_k, size);
        interleave_helper(inptr4, outptr, unroll_k, size);
        interleave_helper(inptr5, outptr, unroll_k, size);
        interleave_helper(inptr6, outptr, unroll_k, size);
        interleave_helper(inptr7, outptr, unroll_k, size);
        interleave_helper(inptr8, outptr, unroll_k, size);
        interleave_helper(inptr9, outptr, unroll_k, size);
        interleave_helper(inptr10, outptr, unroll_k, size);
        interleave_helper(inptr11, outptr, unroll_k, size);
    }
}
/* ======================== transpose pack B ======================== */
/**
 * transpose_INTERLEAVE_UNROLLK_BATCH_type
 *
 * BATCH means each call processes BATCH * INTERLEAVE columns at once, so that
 * BATCH * sizeof(TYPE) * INTERLEAVE = 16 bytes (128 bits, one vector).
 *
 * The element traversal order is:
 * rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *outptr++ = inptr[i, j]
 */
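//! Scalar reference of the traversal order above (a sketch for exposition,
//! not used by any kernel; ld_in is the assumed input row stride):
template <typename T>
static inline void transpose_naive_ref(
        const T* inptr, T* outptr, int interleave, int unroll_k, int ld_in) {
    for (int j = 0; j < interleave; ++j) {
        for (int i = 0; i < unroll_k; ++i) {
            *outptr++ = inptr[i * ld_in + j];  // inptr[i, j]
        }
    }
}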
template <typename T>
static inline void transpose_24x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldr q2, [%[in0]], #16\n"
            ASM_PREFETCH("[%[in0], #192]")
            "ldp q3, q4, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            "ldr q5, [%[in1]], #16\n"
            ASM_PREFETCH("[%[in1], #192]")
            "stp q4, q5, [%[out], #64]\n"
            "ldp q6, q7, [%[in2]], #32\n"
            "stp q6, q7, [%[out], #96]\n"
            "ldr q8, [%[in2]], #16\n"
            ASM_PREFETCH("[%[in2], #192]")
            "ldp q9, q10, [%[in3]], #32\n"
            "stp q8, q9, [%[out], #128]\n"
            "ldr q11, [%[in3]], #16\n"
            "stp q10, q11, [%[out], #160]\n"
            ASM_PREFETCH("[%[in3], #192]")
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "memory");
}
template <typename T>
static inline void transpose_16x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldp q2, q3, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            "ldp q4, q5, [%[in2]], #32\n"
            "stp q4, q5, [%[out], #64]\n"
            "ldp q6, q7, [%[in3]], #32\n"
            "stp q6, q7, [%[out], #96]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
template <typename T>
static inline void transpose_8x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            "ldr q1, [%[in1]], #16\n"
            "str q1, [%[out], #16]\n"
            "ldr q2, [%[in2]], #16\n"
            "str q2, [%[out], #32]\n"
            "ldr q3, [%[in3]], #16\n"
            "str q3, [%[out], #48]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
template <typename T>
static inline void transpose_24x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldr q2, [%[in0]], #16\n"
            ASM_PREFETCH("[%[in0], #192]")
            "ldp q3, q4, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            "ldr q5, [%[in1]], #16\n"
            ASM_PREFETCH("[%[in1], #192]")
            "stp q4, q5, [%[out], #64]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}
template <typename T>
static inline void transpose_16x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldp q2, q3, [%[in1]], #32\n"
            "stp q2, q3, [%[out], #32]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
template <typename T>
static inline void transpose_8x2_1_h(const T*& in0, const T*& in1, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            "ldr q1, [%[in1]], #16\n"
            "str q1, [%[out], #16]\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
}
template <typename T>
static inline void transpose_24x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            "ldr q2, [%[in0]], #16\n"
            ASM_PREFETCH("[%[in0], #192]")
            "str q2, [%[out], #32]\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "v2", "memory");
    // clang-format on
}
template <typename T>
static inline void transpose_16x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldp q0, q1, [%[in0]], #32\n"
            "stp q0, q1, [%[out]]\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
}
template <typename T>
static inline void transpose_12x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ld1 {v0.8h}, [%[in0]], #16\n"
            "ld1 {v1.4h}, [%[in0]], #8\n"
            "st1 {v0.8h}, [%[out]], #16\n"
            "st1 {v1.4h}, [%[out]], #8\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "v1", "memory");
    // clang-format on
}
template <typename T>
static inline void transpose_8x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"
            "str q0, [%[out]]\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "memory");
}
template <typename T>
static inline void transpose_4x1_1_h(const T*& in0, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    // clang-format off
    asm volatile(
            "ld1 {v0.4h}, [%[in0]], #8\n"
            "st1 {v0.4h}, [%[out]], #8\n"
            : [in0] "+r"(in0), [out] "+r"(out)
            :
            : "v0", "memory");
    // clang-format on
}
template <typename T>
static inline void transpose_4x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int stride = 16) {
    static_assert(sizeof(T) == 4, "transpose_4x4_1_s only support sizeof(T) == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], 16\n"  // A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], 16\n"  // B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], 16\n"  // C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], 16\n"  // D0D1D2D3
            "zip1 v4.4s, v0.4s, v1.4s\n"
            "zip1 v5.4s, v2.4s, v3.4s\n"
            "zip2 v6.4s, v0.4s, v1.4s\n"
            "zip2 v7.4s, v2.4s, v3.4s\n"
            "zip1 v8.2d, v4.2d, v5.2d\n"
            "zip1 v9.2d, v6.2d, v7.2d\n"
            "zip2 v10.2d, v4.2d, v5.2d\n"
            "zip2 v11.2d, v6.2d, v7.2d\n"
            "st1 {v8.4s}, [%[outptr]], %x[stride]\n"
            "st1 {v10.4s}, [%[outptr]], %x[stride]\n"
            "st1 {v9.4s}, [%[outptr]], %x[stride]\n"
            "st1 {v11.4s}, [%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "memory");
}
template <typename T>
static inline void transpose_1x12_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_1x12_4_s only support sizeof(T) == 4");
    asm volatile(
            "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr0]], #64\n"
            "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[inptr0]], #64\n"
            "stp q0, q4, [%[outptr]]\n"
            "stp q8, q1, [%[outptr], #32]\n"
            "stp q5, q9, [%[outptr], #64]\n"
            "stp q2, q6, [%[outptr], #96]\n"
            "stp q10, q3, [%[outptr], #128]\n"
            "stp q7, q11, [%[outptr], #160]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "memory");
}
template <typename T>
static inline void transpose_1x4_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_1x4_4_s only support sizeof(T) == 4");
    asm volatile(
            "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
template <typename T>
static inline void transpose_8x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_8x4_1_s only support sizeof(T) == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], 16\n"  // A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], 16\n"  // B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], 16\n"  // C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], 16\n"  // D0D1D2D3
            "ld1 {v4.4s}, [%[inptr4]], 16\n"  // E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], 16\n"  // F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], 16\n"  // G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], 16\n"  // H0H1H2H3
            "zip1 v8.4s, v0.4s, v1.4s\n"   // A0B0A1B1
            "zip2 v9.4s, v0.4s, v1.4s\n"   // A2B2A3B3
            "zip1 v10.4s, v2.4s, v3.4s\n"  // C0D0C1D1
            "zip2 v11.4s, v2.4s, v3.4s\n"  // C2D2C3D3
            "zip1 v12.4s, v4.4s, v5.4s\n"  // E0F0E1F1
            "zip2 v13.4s, v4.4s, v5.4s\n"  // E2F2E3F3
            "zip1 v14.4s, v6.4s, v7.4s\n"  // G0H0G1H1
            "zip2 v15.4s, v6.4s, v7.4s\n"  // G2H2G3H3
            "zip1 v0.2d, v8.2d, v10.2d\n"   // A0B0C0D0
            "zip2 v2.2d, v8.2d, v10.2d\n"   // A1B1C1D1
            "zip1 v4.2d, v9.2d, v11.2d\n"   // A2B2C2D2
            "zip2 v6.2d, v9.2d, v11.2d\n"   // A3B3C3D3
            "zip1 v1.2d, v12.2d, v14.2d\n"  // E0F0G0H0
            "zip2 v3.2d, v12.2d, v14.2d\n"  // E1F1G1H1
            "zip1 v5.2d, v13.2d, v15.2d\n"  // E2F2G2H2
            "zip2 v7.2d, v13.2d, v15.2d\n"  // E3F3G3H3
            "st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [%[outptr]], #64\n"
            "st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "memory");
}
template <typename T>
static inline void transpose_12x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
        T* outptr) {
    static_assert(sizeof(T) == 4, "transpose_12x4_1_s only support sizeof(T) == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], 16\n"    // A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], 16\n"    // B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], 16\n"    // C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], 16\n"    // D0D1D2D3
            "ld1 {v4.4s}, [%[inptr4]], 16\n"    // E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], 16\n"    // F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], 16\n"    // G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], 16\n"    // H0H1H2H3
            "ld1 {v16.4s}, [%[inptr8]], 16\n"   // I0I1I2I3
            "ld1 {v17.4s}, [%[inptr9]], 16\n"   // J0J1J2J3
            "ld1 {v18.4s}, [%[inptr10]], 16\n"  // K0K1K2K3
            "ld1 {v19.4s}, [%[inptr11]], 16\n"  // L0L1L2L3
            "zip1 v8.4s, v0.4s, v1.4s\n"     // A0B0A1B1
            "zip2 v9.4s, v0.4s, v1.4s\n"     // A2B2A3B3
            "zip1 v10.4s, v2.4s, v3.4s\n"    // C0D0C1D1
            "zip2 v11.4s, v2.4s, v3.4s\n"    // C2D2C3D3
            "zip1 v12.4s, v4.4s, v5.4s\n"    // E0F0E1F1
            "zip2 v13.4s, v4.4s, v5.4s\n"    // E2F2E3F3
            "zip1 v14.4s, v6.4s, v7.4s\n"    // G0H0G1H1
            "zip2 v15.4s, v6.4s, v7.4s\n"    // G2H2G3H3
            "zip1 v20.4s, v16.4s, v17.4s\n"  // I0J0I1J1
            "zip2 v21.4s, v16.4s, v17.4s\n"  // I2J2I3J3
            "zip1 v22.4s, v18.4s, v19.4s\n"  // K0L0K1L1
            "zip2 v23.4s, v18.4s, v19.4s\n"  // K2L2K3L3
            "zip1 v0.2d, v8.2d, v10.2d\n"    // A0B0C0D0
            "zip2 v3.2d, v8.2d, v10.2d\n"    // A1B1C1D1
            "zip1 v6.2d, v9.2d, v11.2d\n"    // A2B2C2D2
            "zip2 v24.2d, v9.2d, v11.2d\n"   // A3B3C3D3
            "zip1 v1.2d, v12.2d, v14.2d\n"   // E0F0G0H0
            "zip2 v4.2d, v12.2d, v14.2d\n"   // E1F1G1H1
            "zip1 v7.2d, v13.2d, v15.2d\n"   // E2F2G2H2
            "zip2 v25.2d, v13.2d, v15.2d\n"  // E3F3G3H3
            "zip1 v2.2d, v20.2d, v22.2d\n"   // I0J0K0L0
            "zip2 v5.2d, v20.2d, v22.2d\n"   // I1J1K1L1
            "zip1 v8.2d, v21.2d, v23.2d\n"   // I2J2K2L2
            "zip2 v26.2d, v21.2d, v23.2d\n"  // I3J3K3L3
            "st1 {v0.4s,v1.4s,v2.4s}, [%[outptr]], #48\n"
            "st1 {v3.4s,v4.4s,v5.4s}, [%[outptr]], #48\n"
            "st1 {v6.4s,v7.4s,v8.4s}, [%[outptr]], #48\n"
            "st1 {v24.4s,v25.4s,v26.4s}, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10),
              [inptr11] "+r"(inptr11), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "memory");
}
template <typename T>
static inline void transpose_12x4_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_12x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ldr q0, [%[inptr0]], #12\n"  // A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15A16
            "ldr q1, [%[inptr1]], #12\n"  // B1B2B3B4B5B6B7B8B9B10B11B12B13B14B15B16
            "ldr q2, [%[inptr2]], #12\n"  // C1C2C3C4C5C6C7C8C9C10C11C12C13C14C15C16
            //! \warning the last row read through inptr3 may be shorter than 16
            //! bytes, so the read is split into an 8-byte and a 4-byte part
            "ldr d3, [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
            "ldr w1, [%[inptr3]], #4\n"  // D9D10D11D12
            "ins v3.s[2], w1\n"
            "trn1 v4.16b, v0.16b, v1.16b\n"  // v4: A1B1A3B3....
            "trn2 v5.16b, v0.16b, v1.16b\n"  // v5: A2B2A4B4....
            "trn1 v6.16b, v2.16b, v3.16b\n"  // v6: C1D1C3D3....
            "trn2 v7.16b, v2.16b, v3.16b\n"  // v7: C2D2C4D4....
            "trn1 v8.8h, v4.8h, v6.8h\n"     // v8: A1B1C1D1A5B5C5D5...
            "trn2 v9.8h, v4.8h, v6.8h\n"     // v9: A3B3C3D3A7B7C7D7...
            "trn1 v10.8h, v5.8h, v7.8h\n"    // v10: A2B2C2D2A6B6C6D6...
            "trn2 v11.8h, v5.8h, v7.8h\n"    // v11: A4B4C4D4A8B8C8D8...
            //! with ABCD = E:
            //! v8: E1E5E9E13  v10: E2E6E10E14  v9: E3E7E11E15  v11: E4E8E12E16
            "zip1 v12.4s, v8.4s, v10.4s\n"   // v12: E1E2E5E6
            "zip2 v13.4s, v8.4s, v10.4s\n"   // v13: E9E10E13E14
            "zip1 v14.4s, v9.4s, v11.4s\n"   // v14: E3E4E7E8
            "zip2 v15.4s, v9.4s, v11.4s\n"   // v15: E11E12E15E16
            "zip1 v17.2d, v12.2d, v14.2d\n"  // v17: E1E2E3E4
            "zip2 v18.2d, v12.2d, v14.2d\n"  // v18: E5E6E7E8
            "zip1 v19.2d, v13.2d, v15.2d\n"  // v19: E9E10E11E12
            "zip2 v20.2d, v13.2d, v15.2d\n"  // v20: E13E14E15E16
            //! the 12x4 block transposed is 4x12 = 48 bytes, so v20 is not stored
            "stp q17, q18, [%[outptr]], #32\n"
            "str q19, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "w1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
              "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
              "v20", "memory");
}
template <typename T>
static inline void transpose_8x4_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x4_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.d}[0], [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.d}[0], [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v0.d}[1], [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v1.d}[1], [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
            "zip1 v2.16b, v0.16b, v1.16b\n"  // A1B1A2B2A3B3A4B4A5B5A6B6A7B7A8B8
            "zip2 v3.16b, v0.16b, v1.16b\n"  // C1D1C2D2C3D3C4D4C5D5C6D6C7D7C8D8
            "zip1 v4.8h, v2.8h, v3.8h\n"     // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
            "zip2 v5.8h, v2.8h, v3.8h\n"     // A5B5C5D5A6B6C6D6A7B7C7D7A8B8C8D8
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}
template <typename T>
static inline void transpose_4x8_1_b_with_shift(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static int8x16_t shuffle_idx = {0, 4, 8,  12, 1, 5, 9,  13,
                                    2, 6, 10, 14, 3, 7, 11, 15};
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_4x8_1_b_with_shift only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"  // A1A2A3A4
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"  // B1B2B3B4
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"  // C1C2C3C4
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"  // D1D2D3D4
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"  // E1E2E3E4
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"  // F1F2F3F4
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"  // G1G2G3G4
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"  // H1H2H3H4
            "tbl v2.16b, {v0.16b}, %[shuffle_idx].16b\n"  // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
            "tbl v3.16b, {v1.16b}, %[shuffle_idx].16b\n"  // E1F1G1H1E2F2G2H2E3F3G3H3E4F4G4H4
            "zip1 v4.4s, v2.4s, v3.4s\n"  // A1B1C1D1E1F1G1H1 A2B2C2D2E2F2G2H2
            "zip2 v5.4s, v2.4s, v3.4s\n"  // A3B3C3D3E3F3G3H3 A4B4C4D4E4F4G4H4
            "shl v6.16b, v4.16b, #4\n"
            "sshr v7.16b, v4.16b, #4\n"   // high nibble, sign-extended
            "sshr v8.16b, v6.16b, #4\n"   // low nibble, sign-extended
            "shl v9.16b, v5.16b, #4\n"
            "sshr v10.16b, v5.16b, #4\n"  // high nibble, sign-extended
            "sshr v11.16b, v9.16b, #4\n"  // low nibble, sign-extended
            "zip1 v0.2d, v8.2d, v7.2d\n"
            "zip2 v1.2d, v8.2d, v7.2d\n"
            "zip1 v2.2d, v11.2d, v10.2d\n"
            "zip2 v3.2d, v11.2d, v10.2d\n"
            "st1 {v0.2d-v3.2d}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [shuffle_idx] "+w"(shuffle_idx), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "memory");
}
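//! Scalar sketch of the shl/sshr nibble split above (exposition only; the
//! function name is illustrative): each packed byte yields two sign-extended
//! int8 values, from bits [3:0] (low) and bits [7:4] (high).
static inline void split_nibbles_sketch(int8_t packed, int8_t& low, int8_t& high) {
    low = static_cast<int8_t>(
            static_cast<int8_t>(packed << 4) >> 4);  // shl #4 then sshr #4
    high = static_cast<int8_t>(packed >> 4);         // sshr #4
}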
template <typename T>
static inline void transpose_8x8_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x8_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.8b}, [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.8b}, [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v2.8b}, [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v3.8b}, [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v4.8b}, [%[inptr4]], #8\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v5.8b}, [%[inptr5]], #8\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v6.8b}, [%[inptr6]], #8\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v7.8b}, [%[inptr7]], #8\n"  // H1H2H3H4H5H6H7H8
            "zip1 v8.16b, v0.16b, v1.16b\n"   // A1B1A2B2A3B3A4B4 A5B5A6B6A7B7A8B8
            "zip1 v9.16b, v2.16b, v3.16b\n"   // C1D1C2D2C3D3C4D4 C5D5C6D6C7D7C8D8
            "zip1 v10.16b, v4.16b, v5.16b\n"  // E1F1E2F2E3F3E4F4 E5F5E6F6E7F7E8F8
            "zip1 v11.16b, v6.16b, v7.16b\n"  // G1H1G2H2G3H3G4H4 G5H5G6H6G7H7G8H8
            "zip1 v12.8h, v8.8h, v9.8h\n"     // A1B1C1D1A2B2C2D2 A3B3C3D3A4B4C4D4
            "zip1 v13.8h, v10.8h, v11.8h\n"   // E1F1G1H1E2F2G2H2 E3F3G3H3E4F4G4H4
            "zip2 v14.8h, v8.8h, v9.8h\n"     // A5B5C5D5A6B6C6D6 A7B7C7D7A8B8C8D8
            "zip2 v15.8h, v10.8h, v11.8h\n"   // E5F5G5H5E6F6G6H6 E7F7G7H7E8F8G8H8
            "zip1 v16.4s, v12.4s, v13.4s\n"   // A1B1C1D1E1F1G1H1 A2B2C2D2E2F2G2H2
            "zip1 v18.4s, v14.4s, v15.4s\n"   // A5B5C5D5E5F5G5H5 A6B6C6D6E6F6G6H6
            "zip2 v17.4s, v12.4s, v13.4s\n"   // A3B3C3D3E3F3G3H3 A4B4C4D4E4F4G4H4
            "zip2 v19.4s, v14.4s, v15.4s\n"   // A7B7C7D7E7F7G7H7 A8B8C8D8E8F8G8H8
            "st1 {v16.16b}, [%[outptr]], #16\n"  // A1B1C1D1E1F1G1H1 A2B2C2D2E2F2G2H2
            "st1 {v17.16b}, [%[outptr]], #16\n"  // A3B3C3D3E3F3G3H3 A4B4C4D4E4F4G4H4
            "st1 {v18.16b}, [%[outptr]], #16\n"  // A5B5C5D5E5F5G5H5 A6B6C6D6E6F6G6H6
            "st1 {v19.16b}, [%[outptr]], #16\n"  // A7B7C7D7E7F7G7H7 A8B8C8D8E8F8G8H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
              "memory");
}
template <typename T>
static inline void transpose_4x16_1_b_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    static int8x16_t shuffle_idx = {0, 4, 8,  12, 1, 5, 9,  13,
                                    2, 6, 10, 14, 3, 7, 11, 15};
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"
            "tbl v2.16b, {v0.16b}, %[shuffle_idx].16b\n"
            "tbl v3.16b, {v1.16b}, %[shuffle_idx].16b\n"
            "zip1 v4.4s, v2.4s, v3.4s\n"
            "zip2 v5.4s, v2.4s, v3.4s\n"
            "dup v6.2d, v4.d[1]\n"
            "dup v7.2d, v5.d[1]\n"
            "str d4, [%[outptr]], #16\n"
            "str d6, [%[outptr]], #16\n"
            "str d5, [%[outptr]], #16\n"
            "str d7, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr),
              [shuffle_idx] "+w"(shuffle_idx)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
template <typename T>
static inline void transpose_4(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int interleave, int size, T val = 0) {
    megdnn_assert(size <= interleave);
    int i = 0;
    for (; i < size; i++) {
        *outptr++ = *inptr0++;
        *outptr++ = *inptr1++;
        *outptr++ = *inptr2++;
        *outptr++ = *inptr3++;
    }
    for (; i < interleave; i++) {
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
    }
}
template <typename T>
static inline void transpose_8(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr, int interleave, int size, T val = 0) {
    megdnn_assert(size <= interleave);
    int i = 0;
    for (; i < size; i++) {
        *outptr++ = *inptr0++;
        *outptr++ = *inptr1++;
        *outptr++ = *inptr2++;
        *outptr++ = *inptr3++;
        *outptr++ = *inptr4++;
        *outptr++ = *inptr5++;
        *outptr++ = *inptr6++;
        *outptr++ = *inptr7++;
    }
    for (; i < interleave; i++) {
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
        *outptr++ = val;
    }
}
/***************************** Transpose then interleave ********************/
//! pack from {1, 4(icb), 4(ic), 4(oc)} to {1, 1, 4(oc), 16(ic)}
template <typename T>
static inline void transpose_interleave_4x4_4_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T* outptr, int stride = 64) {
    static_assert(
            sizeof(T) == 1,
            "transpose_interleave_4x4_4_b only support sizeof(T) == 1");
    asm volatile(
            "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[inptr0]], 64\n"
            "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[inptr1]], 64\n"
            "ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[inptr2]], 64\n"
            "ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[inptr3]], 64\n"
            "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[outptr]], %x[stride]\n"
            "st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[outptr]], %x[stride]\n"
            "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[outptr]], %x[stride]\n"
            "st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "memory");
}
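//! Scalar reference of the pack above (a sketch for exposition, not used by
//! any kernel): one 64-byte block from one input pointer, input indexed as
//! [icb][ic][oc] = [4][4][4] with oc innermost, output as [oc][icb * 4 + ic];
//! this is exactly what one ld4/st1 pair of the asm computes.
static inline void transpose_interleave_4x4_4_b_ref(const int8_t* in, int8_t* out) {
    for (int oc = 0; oc < 4; ++oc) {
        for (int icb = 0; icb < 4; ++icb) {
            for (int ic = 0; ic < 4; ++ic) {
                out[oc * 16 + icb * 4 + ic] = in[icb * 16 + ic * 4 + oc];
            }
        }
    }
}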
template <typename T>
static inline void transpose_interleave_1x4_4_b(
        const T*& inptr0, T* outptr, int stride = 64) {
    static_assert(
            sizeof(T) == 1,
            "transpose_interleave_1x4_4_b only support sizeof(T) == 1");
    asm volatile(
            "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[inptr0]], 64\n"
            "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[outptr]], %x[stride]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr), [stride] "+r"(stride)
            :
            : "v0", "v1", "v2", "v3", "v4", "memory");
}
static inline void interleave_4x4_16x4_s8_s16(
        const int8_t* inptr0, const int8_t* inptr1, const int8_t* inptr2,
        const int8_t* inptr3, int16_t* outptr) {
    int8x16_t row0 = vld1q_s8(inptr0);
    int16x8_t row0_01 = vmovl_low_s8(row0);
    int16x8_t row0_23 = vmovl_high_s8(row0);
    int16x4_t row0_0 = vget_low_s16(row0_01);
    int16x4_t row0_1 = vget_high_s16(row0_01);
    int16x4_t row0_2 = vget_low_s16(row0_23);
    int16x4_t row0_3 = vget_high_s16(row0_23);
    int8x16_t row1 = vld1q_s8(inptr1);
    int16x8_t row1_01 = vmovl_low_s8(row1);
    int16x8_t row1_23 = vmovl_high_s8(row1);
    int16x4_t row1_0 = vget_low_s16(row1_01);
    int16x4_t row1_1 = vget_high_s16(row1_01);
    int16x4_t row1_2 = vget_low_s16(row1_23);
    int16x4_t row1_3 = vget_high_s16(row1_23);
    int8x16_t row2 = vld1q_s8(inptr2);
    int16x8_t row2_01 = vmovl_low_s8(row2);
    int16x8_t row2_23 = vmovl_high_s8(row2);
    int16x4_t row2_0 = vget_low_s16(row2_01);
    int16x4_t row2_1 = vget_high_s16(row2_01);
    int16x4_t row2_2 = vget_low_s16(row2_23);
    int16x4_t row2_3 = vget_high_s16(row2_23);
    int8x16_t row3 = vld1q_s8(inptr3);
    int16x8_t row3_01 = vmovl_low_s8(row3);
    int16x8_t row3_23 = vmovl_high_s8(row3);
    int16x4_t row3_0 = vget_low_s16(row3_01);
    int16x4_t row3_1 = vget_high_s16(row3_01);
    int16x4_t row3_2 = vget_low_s16(row3_23);
    int16x4_t row3_3 = vget_high_s16(row3_23);
    vst1_s16(outptr, row0_0);
    vst1_s16(outptr + 1 * 4, row1_0);
    vst1_s16(outptr + 2 * 4, row2_0);
    vst1_s16(outptr + 3 * 4, row3_0);
    vst1_s16(outptr + 4 * 4, row0_1);
    vst1_s16(outptr + 5 * 4, row1_1);
    vst1_s16(outptr + 6 * 4, row2_1);
    vst1_s16(outptr + 7 * 4, row3_1);
    vst1_s16(outptr + 8 * 4, row0_2);
    vst1_s16(outptr + 9 * 4, row1_2);
    vst1_s16(outptr + 10 * 4, row2_2);
    vst1_s16(outptr + 11 * 4, row3_2);
    vst1_s16(outptr + 12 * 4, row0_3);
    vst1_s16(outptr + 13 * 4, row1_3);
    vst1_s16(outptr + 14 * 4, row2_3);
    vst1_s16(outptr + 15 * 4, row3_3);
}
static inline void interleave_4x4_8x4_s8_s16(
        const int8_t* inptr0, const int8_t* inptr1, int16_t* outptr) {
    int8x16_t row0 = vld1q_s8(inptr0);
    int16x8_t row0_01 = vmovl_low_s8(row0);
    int16x8_t row0_23 = vmovl_high_s8(row0);
    int16x4_t row0_0 = vget_low_s16(row0_01);
    int16x4_t row0_1 = vget_high_s16(row0_01);
    int16x4_t row0_2 = vget_low_s16(row0_23);
    int16x4_t row0_3 = vget_high_s16(row0_23);
    int8x16_t row1 = vld1q_s8(inptr1);
    int16x8_t row1_01 = vmovl_low_s8(row1);
    int16x8_t row1_23 = vmovl_high_s8(row1);
    int16x4_t row1_0 = vget_low_s16(row1_01);
    int16x4_t row1_1 = vget_high_s16(row1_01);
    int16x4_t row1_2 = vget_low_s16(row1_23);
    int16x4_t row1_3 = vget_high_s16(row1_23);
    vst1_s16(outptr, row0_0);
    vst1_s16(outptr + 1 * 4, row1_0);
    vst1_s16(outptr + 2 * 4, row0_1);
    vst1_s16(outptr + 3 * 4, row1_1);
    vst1_s16(outptr + 4 * 4, row0_2);
    vst1_s16(outptr + 5 * 4, row1_2);
    vst1_s16(outptr + 6 * 4, row0_3);
    vst1_s16(outptr + 7 * 4, row1_3);
}
static inline void memcpy_s8_s16(const int8_t* inptr, int16_t* outptr, int count) {
    for (; count >= 32; count -= 32) {
        int8x8_t in0 = vld1_s8(inptr);
        int8x8_t in1 = vld1_s8(inptr + 1 * 8);
        int8x8_t in2 = vld1_s8(inptr + 2 * 8);
        int8x8_t in3 = vld1_s8(inptr + 3 * 8);
        vst1q_s16(outptr, vmovl_s8(in0));
        vst1q_s16(outptr + 1 * 8, vmovl_s8(in1));
        vst1q_s16(outptr + 2 * 8, vmovl_s8(in2));
        vst1q_s16(outptr + 3 * 8, vmovl_s8(in3));
        inptr += 32;
        outptr += 32;
    }
    for (; count >= 8; count -= 8) {
        int8x8_t in0 = vld1_s8(inptr);
        vst1q_s16(outptr, vmovl_s8(in0));
        inptr += 8;
        outptr += 8;
    }
    for (; count > 0; --count) {
        *outptr++ = (int16_t)(*inptr++);
    }
}
static inline void transpos_12x4_s8(const int8_t* inptr0, int8_t* outptr) {
    static const uint8_t src_idx_buffer[16] = {0, 4, 8,  12, 1, 5, 9,  13,
                                               2, 6, 10, 14, 3, 7, 11, 15};
    static const uint8x16_t vtbl = vld1q_u8(&src_idx_buffer[0]);
    int8x8x4_t input = vld4_s8(inptr0);
    int8x16_t input2 = vqtbl1q_s8(vld1q_s8(inptr0 + 4 * 8), vtbl);
    vst1_s8(outptr, input.val[0]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 8), vreinterpretq_s32_s8(input2), 0);
    vst1_s8(outptr + 1 * 12, input.val[1]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 1 * 12 + 8),
            vreinterpretq_s32_s8(input2), 1);
    vst1_s8(outptr + 2 * 12, input.val[2]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 2 * 12 + 8),
            vreinterpretq_s32_s8(input2), 2);
    vst1_s8(outptr + 3 * 12, input.val[3]);
    vst1q_lane_s32(
            reinterpret_cast<int32_t*>(outptr + 3 * 12 + 8),
            vreinterpretq_s32_s8(input2), 3);
}
template <typename T>
static inline void interleave_8x8_mk4_b(
        const T*& inptr0, const T*& inptr1, T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_mk4_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "ld1 {v1.4s}, [%[inptr1]], #16\n"
            "ld1 {v2.4s}, [%[inptr0]], #16\n"
            "ld1 {v3.4s}, [%[inptr1]], #16\n"
            "zip1 v4.4s, v0.4s, v1.4s\n"
            "zip2 v5.4s, v0.4s, v1.4s\n"
            "zip1 v6.4s, v2.4s, v3.4s\n"
            "zip2 v7.4s, v2.4s, v3.4s\n"
            "st1 {v4.4s}, [%[outptr]], #16\n"
            "st1 {v5.4s}, [%[outptr]], #16\n"
            "st1 {v6.4s}, [%[outptr]], #16\n"
            "st1 {v7.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
template <typename T>
static inline void transpose_8x8_mk4_b(const T*& inptr0, const T*& inptr1, T* outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "transpose_8x8_mk4_b only support uint8_t and int8_t");
    asm volatile(
            "ld4 {v0.8b-v3.8b}, [%[inptr0]], #32\n"
            "ld4 {v4.8b-v7.8b}, [%[inptr1]], #32\n"
            "st1 {v0.2s}, [%[outptr]], #8\n"
            "st1 {v1.2s}, [%[outptr]], #8\n"
            "st1 {v2.2s}, [%[outptr]], #8\n"
            "st1 {v3.2s}, [%[outptr]], #8\n"
            "st1 {v4.2s}, [%[outptr]], #8\n"
            "st1 {v5.2s}, [%[outptr]], #8\n"
            "st1 {v6.2s}, [%[outptr]], #8\n"
            "st1 {v7.2s}, [%[outptr]], #8\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
}  // namespace aarch64
}  // namespace megdnn
// vim: syntax=cpp.doxygen