You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

opr_impl.cpp 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. /**
  2. * \file dnn/src/fallback/relayout/opr_impl.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/fallback/relayout/opr_impl.h"
  12. #include "src/naive/handle.h"
  13. #include "src/common/utils.h"
  14. #include "src/common/relayout_helper.h"
  15. #include <cstring>
  16. using namespace megdnn;
  17. using namespace fallback;
  18. namespace {
  19. bool is_lastdim_contig(const TensorLayout& layout) {
  20. return layout.ndim <= 3 && layout.stride[layout.ndim - 1] == 1;
  21. }
  /*!
   * \brief opaque value type made of \p count consecutive \p elem_t objects
   *
   * \tparam count number of elements stored
   * \tparam elem_t element type of the underlying array (char by default)
   */
  template <size_t count, typename elem_t = char>
  struct equiv_ctype_storage {
      elem_t _[count];
  };
  26. template <typename dtype>
  27. struct equiv_ctype {
  28. using type =
  29. std::aligned_storage_t<sizeof(typename DTypeTrait<dtype>::ctype),
  30. alignof(typename DTypeTrait<dtype>::ctype)>;
  31. };
  //! copy routine between a contiguous and a non-contiguous buffer; the
  //! contiguous buffer is always the first argument, the byte count the last
  using memcpy_policy_t = void (*)(void* cont, void* non_cont, size_t);
  //! memcpy_policy_t implementation copying \p size bytes from the
  //! contiguous buffer \p cont into the non-contiguous buffer \p non_cont
  void memcpy_cont2noncont(void* cont, void* non_cont, size_t size) {
      void* const to = non_cont;
      const void* const from = cont;
      std::memcpy(to, from, size);
  }
  //! memcpy_policy_t implementation copying \p size bytes from the
  //! non-contiguous buffer \p non_cont into the contiguous buffer \p cont
  void memcpy_noncont2cont(void* cont, void* non_cont, size_t size) {
      void* const to = cont;
      const void* const from = non_cont;
      std::memcpy(to, from, size);
  }
  39. template <typename T>
  40. void call_transpose(size_t batch, size_t m, size_t n, size_t ch, void* src,
  41. void* dst) {
  42. megdnn_assert(ch == 1);
  43. relayout::transpose_fallback::transpose<T>(
  44. batch, m, n, static_cast<T*>(src), static_cast<T*>(dst));
  45. }
  46. //! one operand contiguous, and the other non-contiguous
  47. template<typename ctype>
  48. void dispatch_on_dtype_cont(
  49. Handle *handle,
  50. const TensorND &cont, const TensorND &nonc, memcpy_policy_t mcp_pol) {
  51. auto ctptr = static_cast<uint8_t*>(cont.raw_ptr),
  52. ncptr = static_cast<uint8_t*>(nonc.raw_ptr);
  53. thin_function<void()> kern;
  54. switch (nonc.layout.ndim) {
  55. case 2: {
  56. auto shp0 = nonc.layout.shape[0],
  57. shp1 = nonc.layout.shape[1];
  58. auto strd0_n = nonc.layout.stride[0] * sizeof(ctype);
  59. auto strd0_c = shp1 * sizeof(ctype);
  60. kern = [=]() {
  61. auto cur_ctptr = ctptr;
  62. auto cur_ncptr = ncptr;
  63. for (size_t i = 0; i < shp0; ++ i) {
  64. mcp_pol(cur_ctptr, cur_ncptr, strd0_c);
  65. cur_ctptr += strd0_c;
  66. cur_ncptr += strd0_n;
  67. }
  68. };
  69. break;
  70. }
  71. case 3: {
  72. auto shp0 = nonc.layout.shape[0],
  73. shp1 = nonc.layout.shape[1],
  74. shp2 = nonc.layout.shape[2];
  75. auto strd0_n = nonc.layout.stride[0] * sizeof(ctype),
  76. strd1_n = nonc.layout.stride[1] * sizeof(ctype);
  77. auto strd1_c = shp2 * sizeof(ctype);
  78. kern = [=]() {
  79. auto cur_ctptr = ctptr;
  80. auto ncptr_row = ncptr;
  81. for (size_t i = 0; i < shp0; ++ i) {
  82. auto cur_ncptr = ncptr_row;
  83. for (size_t j = 0; j < shp1; ++ j) {
  84. mcp_pol(cur_ctptr, cur_ncptr, strd1_c);
  85. cur_ctptr += strd1_c;
  86. cur_ncptr += strd1_n;
  87. }
  88. ncptr_row += strd0_n;
  89. }
  90. };
  91. break;
  92. }
  93. default:
  94. megdnn_assert(0);
  95. }
  96. static_cast<naive::HandleImpl*>(handle)->dispatch_kern(std::move(kern));
  97. }
  98. void dispatch_cont(Handle *handle, const TensorND &cont, const TensorND &nonc,
  99. memcpy_policy_t mcp_pol) {
  100. switch (cont.layout.dtype.enumv()) {
  101. #define cb(_dt) case DTypeTrait<dtype::_dt>::enumv: \
  102. return dispatch_on_dtype_cont<equiv_ctype<dtype::_dt>::type>( \
  103. handle, cont, nonc, mcp_pol);
  104. MEGDNN_FOREACH_DTYPE_NAME(cb)
  105. MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb)
  106. #undef cb
  107. megdnn_assert(0);
  108. }
  109. }
  110. const size_t BLOCK_SIZE = 16,
  111. TRANSPOSE_CV_MAX_C =
  112. relayout::transpose_fallback::BLOCK_LINE_SIZE_BYTES;
  113. /*!
  114. * \tparam ctype The type of the data
  115. */
  116. template <typename ctype>
  117. void transpose_cv_block(size_t m, size_t n, size_t ch, size_t i, size_t j,
  118. size_t h, size_t w, void *src, void *dst) {
  119. auto batch_src = static_cast<const ctype*>(src);
  120. auto batch_dst = static_cast<ctype*>(dst);
  121. #define SET_VAL(dst, src) \
  122. switch (ch) { \
  123. case 3: \
  124. dst[2] = src[2]; MEGDNN_FALLTHRU \
  125. case 2: \
  126. dst[1] = src[1]; MEGDNN_FALLTHRU \
  127. case 1: \
  128. dst[0] = src[0]; \
  129. break; \
  130. default: \
  131. for (size_t _c = 0; _c < ch; ++_c) dst[_c] = src[_c]; \
  132. break; \
  133. }
  134. constexpr size_t B = BLOCK_SIZE;
  135. static_assert(TRANSPOSE_CV_MAX_C % sizeof(ctype) == 0, "bad ctype");
  136. ctype tmp[B][B][TRANSPOSE_CV_MAX_C / sizeof(ctype)];
  137. auto sptr = batch_src + i * n * ch + j * ch;
  138. for (size_t x = 0; x < h; ++x) {
  139. for (size_t y = 0; y < w; ++y) {
  140. SET_VAL(tmp[y][x], (sptr + y * ch))
  141. }
  142. sptr += n * ch;
  143. }
  144. auto dptr = batch_dst + j * m * ch + i * ch;
  145. for (size_t x = 0; x < w; ++x) {
  146. for (size_t y = 0; y < h; ++y) {
  147. SET_VAL((dptr + y * ch), tmp[x][y])
  148. }
  149. dptr += m * ch;
  150. }
  151. #undef SET_VAL
  152. }
  153. template <typename ctype>
  154. void transpose_cv_row(size_t m, size_t n, size_t ch, size_t i, size_t h,
  155. void *src, void *dst) {
  156. constexpr size_t B = BLOCK_SIZE;
  157. size_t j = 0;
  158. for (; j + B <= n; j += B) {
  159. transpose_cv_block<ctype>(m, n, ch, i, j, h, B, src, dst);
  160. }
  161. if (j < n) {
  162. transpose_cv_block<ctype>(m, n, ch, i, j, h, n - j, src, dst);
  163. }
  164. }
  165. template <typename ctype>
  166. void transpose_cv(size_t batch, size_t m, size_t n, size_t ch, void *src,
  167. void *dst) {
  168. constexpr size_t B = BLOCK_SIZE;
  169. auto batch_src = static_cast<ctype *>(src);
  170. auto batch_dst = static_cast<ctype *>(dst);
  171. for (size_t b = 0; b < batch; ++b) {
  172. size_t i = 0;
  173. for (; i + B <= m; i += B) {
  174. transpose_cv_row<ctype>(m, n, ch, i, B, batch_src, batch_dst);
  175. }
  176. if (i < m) {
  177. transpose_cv_row<ctype>(m, n, ch, i, m - i, batch_src, batch_dst);
  178. }
  179. batch_src += m * n * ch;
  180. batch_dst += m * n * ch;
  181. }
  182. }
  183. } // anonymous namespace
  184. void RelayoutForwardImpl::exec(
  185. _megdnn_tensor_in src0, _megdnn_tensor_out dst0,
  186. Handle *src_handle) {
  187. check_cpu_handle(src_handle);
  188. TensorND src = src0, dst = dst0;
  189. check_layout_and_canonize(src.layout, dst.layout);
  190. bool has_neg_stride = false;
  191. for (size_t i = 0; i < src.layout.ndim; ++ i) {
  192. if (src.layout.stride[i] < 0) {
  193. has_neg_stride = true;
  194. break;
  195. }
  196. }
  197. for (size_t i = 0; i < dst.layout.ndim; ++ i) {
  198. if (dst.layout.stride[i] < 0) {
  199. has_neg_stride = true;
  200. break;
  201. }
  202. }
  203. if (has_neg_stride) {
  204. NaiveRelayoutForwardImpl::do_exec(src, dst);
  205. return;
  206. }
  207. relayout::TransposeParam trans_param;
  208. bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
  209. exec_after_preprocess(src, dst, trans ? &trans_param : nullptr);
  210. }
  211. void RelayoutForwardImpl::exec_after_preprocess(
  212. const TensorND& src, const TensorND& dst,
  213. relayout::TransposeParam* transpose) {
  214. if (transpose) {
  215. auto dsize = src.layout.dtype.size() * transpose->c;
  216. void (*kptr)(size_t, size_t, size_t, size_t, void*, void*) = nullptr;
  217. auto src_addr = reinterpret_cast<uintptr_t>(src.raw_ptr),
  218. dst_addr = reinterpret_cast<uintptr_t>(dst.raw_ptr);
  219. if (dsize == 1) {
  220. megdnn_assert(transpose->c == 1);
  221. kptr = call_transpose<uint8_t>;
  222. } else if (dsize == 2) {
  223. transpose->c = 1;
  224. if (!((src_addr | dst_addr) & (alignof(uint16_t) - 1))) {
  225. kptr = call_transpose<uint16_t>;
  226. } else {
  227. kptr = call_transpose<equiv_ctype_storage<2>>;
  228. megdnn_log_error("unaligned addr in relayout");
  229. }
  230. } else if (dsize == 3) {
  231. transpose->c = 1;
  232. kptr = call_transpose<equiv_ctype_storage<3>>;
  233. } else if (dsize == 4) {
  234. transpose->c = 1;
  235. if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) {
  236. kptr = call_transpose<uint32_t>;
  237. } else {
  238. kptr = call_transpose<equiv_ctype_storage<4>>;
  239. megdnn_log_error("unaligned addr in relayout");
  240. }
  241. } else if (dsize == 12) {
  242. transpose->c = 1;
  243. if (!((src_addr | dst_addr) & (alignof(uint32_t) - 1))) {
  244. kptr = call_transpose<equiv_ctype_storage<3, uint32_t>>;
  245. } else {
  246. kptr = call_transpose<equiv_ctype_storage<12>>;
  247. megdnn_log_error("unaligned addr in relayout");
  248. }
  249. } else if (dsize <= TRANSPOSE_CV_MAX_C) {
  250. switch (dst.layout.dtype.enumv()) {
  251. #define cb(_dt) \
  252. case DTypeTrait<dtype::_dt>::enumv: \
  253. kptr = transpose_cv<equiv_ctype<dtype::_dt>::type>; \
  254. break;
  255. MEGDNN_FOREACH_DTYPE_NAME(cb)
  256. MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb)
  257. #undef cb
  258. }
  259. megdnn_assert(kptr);
  260. }
  261. if (kptr) {
  262. auto kern = [
  263. t = *transpose, sptr = src.raw_ptr, dptr = dst.raw_ptr, kptr
  264. ]() {
  265. kptr(t.batch, t.m, t.n, t.c, sptr, dptr);
  266. };
  267. static_cast<naive::HandleImpl*>(handle())->dispatch_kern(kern);
  268. return;
  269. } else {
  270. megdnn_assert(transpose->c != 1, "unsupported dtype size");
  271. }
  272. }
  273. using relayout::is_contig;
  274. if (is_contig(dst.layout) && is_contig(src.layout)) {
  275. auto sptr = src.raw_ptr, dptr = dst.raw_ptr;
  276. auto sz = src.layout.span().dist_byte();
  277. MEGDNN_DISPATCH_CPU_KERN_OPR(memcpy(dptr, sptr, sz));
  278. return;
  279. }
  280. if (is_contig(dst.layout) && is_lastdim_contig(src.layout)) {
  281. return dispatch_cont(handle(), dst, src, memcpy_noncont2cont);
  282. }
  283. if (is_contig(src.layout) && is_lastdim_contig(dst.layout)) {
  284. return dispatch_cont(handle(), src, dst, memcpy_cont2noncont);
  285. }
  286. NaiveRelayoutForwardImpl::do_exec(src, dst);
  287. }
  288. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台