
tensor.cpp 25 kB

/**
 * \file src/core/impl/tensor.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/tensor.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/param_defs.h"
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megdnn/oprs.h"

#include <thread>
#include <cstring>
#include <cmath>

using namespace mgb;

namespace {

//! implement non-contiguous d2d copy
void noncont_tensor_copy(
        const DeviceTensorND &dest, const DeviceTensorND &src,
        bool contig_dest, bool contig_src) {
    auto src_cn = src.comp_node();
    auto dst_cn = dest.comp_node();
    if (src_cn.device_type() == dst_cn.device_type()) {
        // perform relayout op for better performance when src and dst are
        // placed on comp nodes with the same device type
        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
                dst_cn);
        dst_cn.activate();
        relayout->exec(
                const_cast<DeviceTensorND&>(src).as_megdnn(),
                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
    } else {
        if (contig_src) {
            mgb_assert(!contig_dest);
            DeviceTensorND tmp{dst_cn};
            tmp.copy_from(src);
            dest.copy_from_fixlayout(tmp);
            return;
        }
        DeviceTensorND tmp;
        tmp.copy_from(src);
        dest.copy_from_fixlayout(tmp);
    }
}

//! implement non-contiguous h2h copy
void noncont_tensor_copy(
        const HostTensorND &dest, const HostTensorND &src, bool, bool) {
    auto opr = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
            CompNode::default_cpu());
    opr->exec(
            const_cast<HostTensorND&>(src).as_megdnn(),
            dest.as_megdnn());
}

//! implement non-contiguous d2h copy
void noncont_tensor_copy(
        const HostTensorND &dest, const DeviceTensorND &src,
        bool contig_dest, bool contig_src) {
    if (contig_src) {
        mgb_assert(!contig_dest);
        HostTensorND tmp;
        tmp.copy_from(src).sync();
        dest.copy_from_fixlayout(tmp); // sync not needed for h2h copy
        return;
    }
    DeviceTensorND tmp;
    tmp.copy_from(src);
    dest.copy_from_fixlayout(tmp);
}

//! implement non-contiguous h2d copy
void noncont_tensor_copy(
        const DeviceTensorND &dest, const HostTensorND &src,
        bool contig_dest, bool contig_src) {
    if (contig_src) {
        mgb_assert(!contig_dest);
        DeviceTensorND tmp;
        // no need to sync because device free is async-safe with respect to
        // host thread
        tmp.copy_from(src);
        dest.copy_from_fixlayout(tmp);
        return;
    }
    HostTensorND tmp;
    tmp.copy_from(src);
    dest.copy_from_fixlayout(tmp).sync();
}

} // anonymous namespace
/* ============= Slice and SubTensorSpec ============= */

SubTensorSpec SubTensorSpec::make_from_offset_elem(
        const TensorLayout &layout, ptrdiff_t offset_elem) {
    mgb_assert(layout.ndim && layout.dtype.valid());
    return {layout, offset_elem};
}

SubTensorSpec Slice::apply(TensorLayout layout, int axis) const {
    mgb_assert(layout.ndim > 0 && layout.dtype.valid());
    if (axis == megdnn::param::OptionalAxisV1::INVALID_AXIS) {
        axis = 0;
        layout = layout.collapse_contiguous();
        mgb_assert(layout.ndim == 1,
                "apply Slice with axis==INVALID_AXIS on non-contig layout");
    }
    // axis in [-ndim, ndim) is available
    if (axis < 0)
        axis += layout.ndim;
    mgb_assert(axis >= 0 && static_cast<size_t>(axis) < layout.ndim,
            "invalid axis: %d; ndim=%zu", axis, layout.ndim);
    ptrdiff_t size_ax = layout.shape[axis];
    ptrdiff_t begin, end, step = m_step.val_with_default(1);
    mgb_assert(step, "Slice step can not be zero");
    auto tostr = [](const Maybe<ptrdiff_t> &v) -> std::string {
        if (!v.valid())
            return "None";
        return std::to_string(v.val());
    };
    auto mod_size = [size_ax](ptrdiff_t v) {
        return v < 0 ? v + size_ax : v;
    };
    MGB_MARK_USED_VAR(tostr);
#define CHECK(cond) \
    mgb_assert(cond, \
            "index out of bound: layout=%s; request begin=%s end=%s step=%s " \
            "axis=%d", \
            layout.to_string().c_str(), tostr(m_begin).c_str(), \
            tostr(m_end).c_str(), tostr(m_step).c_str(), axis)
    if (step > 0) {
        begin = mod_size(m_begin.val_with_default(0));
        end = mod_size(m_end.val_with_default(size_ax));
        CHECK(begin >= 0 && end >= begin && end <= size_ax);
    } else {
        begin = mod_size(m_begin.val_with_default(size_ax - 1));
        end = m_end.valid() ? mod_size(m_end.val()) : -1;
        CHECK(step < 0 && begin >= 0 && end <= begin && begin < size_ax &&
                end >= -1);
    }
    auto step_abs = std::abs(step);
    layout.shape[axis] = (std::abs(end - begin) + step_abs - 1) / step_abs;
    auto orig_stride = layout.stride[axis];
    layout.stride[axis] *= step;

    // make stride as contiguous as possible
    if (layout.shape[axis] != 1 && axis)
        -- axis;
    if (layout.shape[axis] == 1) {
        auto stride = layout.stride[axis] =
                axis + 1 < static_cast<int>(layout.ndim) ?
                layout.stride[axis + 1] * layout.shape[axis + 1] : 1;
        for (int i = axis - 1; i >= 0; -- i) {
            if (layout.shape[i] == 1) {
                layout.stride[i] = stride;
            } else {
                break;
            }
        }
    }

    auto offset_elem = layout.is_empty() ? 0 : orig_stride * begin;
    return SubTensorSpec::make_from_offset_elem(layout, offset_elem);
#undef CHECK
}

void SubTensorSpec::merge_with(const SubTensorSpec &rhs) {
    mgb_assert(m_layout.dtype.valid() && m_layout.dtype == rhs.m_layout.dtype &&
            rhs.m_layout.ndim);
    m_offset_elem += rhs.m_offset_elem;
    m_layout = rhs.m_layout;
}
/* ===================== TensorStorage ===================== */

class mgb::HostTensorStorageTrait {
public:
    static void* alloc(CompNode node, size_t size) {
        return node.alloc_host(size);
    }

    static void free(CompNode node, void *data) {
        node.free_host(data);
    }
};

class mgb::DeviceTensorStorageTrait {
public:
    static void* alloc(CompNode node, size_t size) {
        return node.alloc_device(size);
    }

    static void free(CompNode node, void *data) {
        node.free_device(data);
    }
};

template<class Trait>
TensorStorage<Trait>& TensorStorage<Trait>::operator = (
        const TensorStorage& rhs) {
    if (rhs.m_size > rhs.m_capacity) {
        rhs.ptr();
    }
    m_allow_realloc = rhs.m_allow_realloc;
    m_comp_node = rhs.m_comp_node;
    m_size = rhs.m_size;
    m_capacity = rhs.m_capacity;
    m_offset = rhs.m_offset;
    m_data = rhs.m_data;
    return *this;
}

template<class Trait>
TensorStorage<Trait>& TensorStorage<Trait>::ensure_size(size_t sz) {
    if (sz > m_size) {
        mgb_throw_if(!m_allow_realloc || m_offset, MegBrainError,
                "can not grow a tensor that does not allow realloc");
        check_comp_node_valid();
    }
    m_size = sz;
    return *this;
}

template<class Trait>
TensorStorage<Trait> TensorStorage<Trait>::sub(
        ptrdiff_t offset) const {
    ptr(); // apply lazy resize
    ptrdiff_t toff = offset + m_offset;
    if (offset == static_cast<ptrdiff_t>(m_size)) {
        return {false, m_comp_node, 0, 0, 0, RawStorage{}};
    }
    mgb_assert(toff >= 0 && offset < static_cast<ptrdiff_t>(m_size),
            "bad subtensor: offset=%td m_offset=%zu m_size=%zu",
            offset, m_offset, m_size);
    return {false, m_comp_node, m_size - offset, m_capacity - offset,
            static_cast<size_t>(toff), m_data};
}

template<class Trait>
dt_byte* TensorStorage<Trait>::apply_lazy_and_get_ptr() {
    check_comp_node_valid();
    if (m_size > m_capacity) {
        mgb_assert(m_allow_realloc && !m_offset);
        m_data.reset(); // free old ptr
        m_capacity = 0; // to be exception safe
        auto ptr = static_cast<dt_byte*>(Trait::alloc(m_comp_node, m_size));
        mgb_throw_if(!ptr, SystemError, "failed to allocate memory");
        CompNode cn = m_comp_node;
        m_data.reset(ptr, [cn](void *p){Trait::free(cn, p);});
        m_capacity = m_size;
    }
    return m_data.get() + m_offset;
}

template<class Trait>
TensorStorage<Trait>& TensorStorage<Trait>::comp_node(
        CompNode node, bool allow_mem_node_change) {
    mgb_assert(node.valid());
    if (m_comp_node.valid() && node.mem_node() != m_comp_node.mem_node()) {
        mgb_assert(allow_mem_node_change);
        m_allow_realloc = true;
        m_size = m_capacity = m_offset = 0;
        m_data.reset();
    }
    m_comp_node = node;
    return *this;
}

template<class Trait>
void TensorStorage<Trait>::reset(CompNode node, size_t size,
        RawStorage data) {
    mgb_assert(m_allow_realloc);
    m_comp_node = node;
    m_size = size;
    m_capacity = size;
    m_offset = 0;
    m_data = std::move(data);
}

template<class Trait>
template<class RTrait, typename>
TensorStorage<Trait> TensorStorage<Trait>::make_proxy(
        const TensorStorage<RTrait> &src) {
    mgb_assert(src.comp_node().mem_node() == CompNode::default_cpu().mem_node(),
            "proxy source should be on CPU; got %s",
            src.comp_node().to_string().c_str());
    src.ptr();
    return {true, src.m_comp_node, src.m_size, src.m_capacity, src.m_offset,
            src.m_data};
}

template<class Trait>
void TensorStorage<Trait>::on_invalid_comp_node() {
    mgb_throw(MegBrainError, "trying to access TensorStorage with invalid "
            "comp node");
}

namespace mgb {

// host to host
template<> template<>
void TensorStorage<HostTensorStorageTrait>::copy_from(
        const TensorStorage<HostTensorStorageTrait> &src, size_t size) const {
    mgb_assert(size <= this->size() && size <= src.size());
    memcpy(ptr(), src.ptr(), size);
}

// device to host
template<> template<>
void TensorStorage<HostTensorStorageTrait>::copy_from(
        const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
    bool need_sync = false;
    mgb_assert(size <= this->size() && size <= src.size());
    if (m_comp_node != src.comp_node()) {
        auto default_cpu = CompNode::default_cpu();
        if (src.comp_node() != default_cpu) {
            mgb_assert(m_comp_node == default_cpu,
                    "inconsistent D2H copy:"
                    " copy from device to host using different comp nodes:"
                    " device_node=%s host_node=%s",
                    src.comp_node().to_string().c_str(),
                    m_comp_node.to_string().c_str());
            // copy_from() should use m_comp_node, and default_cpu is
            // synchronous with the current thread, so this copy has no
            // synchronizing ambiguity and we only need to sync on host
            need_sync = true;
        }
    }
    src.comp_node().copy_to_host(ptr(), src.ptr(), size);
    if (need_sync)
        src.comp_node().sync();
}

// host to device
template<> template<>
void TensorStorage<DeviceTensorStorageTrait>::copy_from(
        const TensorStorage<HostTensorStorageTrait> &src, size_t size) const {
    mgb_assert(size <= this->size() && size <= src.size());
    m_comp_node.copy_to_device(ptr(), src.ptr(), size);
}

// device to device
template<> template<>
void TensorStorage<DeviceTensorStorageTrait>::copy_from(
        const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
    mgb_assert(size <= this->size() && size <= src.size());
    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
            comp_node().device_type() == CompNode::DeviceType::CUDA) {
        // the current thread (i.e. the cuda dispatcher thread) should wait for
        // all operations on src's comp_node to finish; otherwise a race
        // condition might occur between the worker thread of src's comp_node
        // and the thread responsible for copying the pageable memory in \p src
        // to a pinned buffer, refer to
        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
        //
        // Note: it is highly recommended to copy tensors from cpu to cuda
        // with asynchronous dispatching (see graph option async_exec_level),
        // or the main thread might be blocked by the worker thread
        // corresponding to src's comp_node, resulting in bad performance
        //
        // TODO: consider using cudaMallocHost or cudaHostRegister
        // to pin the memory of the src tensor, so it does not require
        // synchronization and is more efficient
        src.comp_node().sync();
        comp_node().copy_to_device(ptr(), src.ptr(), size);
    } else {
        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
    }
}

// proxy host to device
template TensorStorage<DeviceTensorStorageTrait>
TensorStorage<DeviceTensorStorageTrait>::
        make_proxy<HostTensorStorageTrait, void>(
                const TensorStorage<HostTensorStorageTrait>&);

// proxy device to host
template TensorStorage<HostTensorStorageTrait>
TensorStorage<HostTensorStorageTrait>::
        make_proxy<DeviceTensorStorageTrait, void>(
                const TensorStorage<DeviceTensorStorageTrait>&);

}
/* ===================== TensorND ===================== */

// ctor def {

#define DEF \
    template <class TensorStorage> \
    TensorND<TensorStorage>::TensorND

DEF() = default;

DEF(CompNode node) : m_storage{node} {}

DEF(DType dtype) : m_layout{dtype} {}

DEF(CompNode node, DType dtype) : m_storage{node}, m_layout{dtype} {}

//! allocate contiguous from given comp node, shape and dtype
DEF(CompNode node, const TensorShape& shape, DType dtype, TensorFormat format)
        : m_storage{node}, m_layout{dtype, format} {
    resize(shape);
}

//! allocate contiguous from given comp node and layout (strides not used)
DEF(CompNode node, const TensorLayout& layout)
        : TensorND(node, layout, layout.dtype, layout.format) {
    mgb_assert(layout.is_contiguous(),
            "non-contiguous layout used for initializing a tensor: %s",
            layout.to_string().c_str());
}

#undef DEF
// ctor def }

// def {

#define DEF(name, ret) \
    template<class TensorStorage> \
    typename TensorND<TensorStorage>::ChainReturnType ret \
    TensorND<TensorStorage>::name

DEF(resize, &)(const TensorShape& shape) {
    mgb_assert(m_layout.dtype.valid());
    auto nr_elems = m_layout.init_contiguous_stride(shape);
    m_storage.ensure_size(m_layout.dtype.size(nr_elems));
    return static_cast<ChainReturnType&>(*this);
}

DEF(reset, &)(TensorStorage storage, const TensorLayout &layout) {
    //! The storage to be reset must either satisfy the layout or be empty.
    //! Empty storage is used after weight preprocessing to save memory and
    //! to check the layout when running
    mgb_assert(!layout.ndim || storage.valid_span(layout.span()) ||
            storage.empty());
    m_storage = std::move(storage);
    m_layout = layout;
    return static_cast<ChainReturnType&>(*this);
}

DEF(comp_node, &)(CompNode comp_node, bool allow_mem_node_change) {
    auto orig_cn = m_storage.comp_node_allow_invalid();
    m_storage.comp_node(comp_node, allow_mem_node_change);
    if (orig_cn.valid() && orig_cn.mem_node() != comp_node.mem_node()) {
        m_layout.ndim = 0;
    }
    return static_cast<ChainReturnType&>(*this);
}

DEF(storage, &)(const TensorStorage &storage) {
    if (m_storage.empty() || storage.empty() ||
            m_storage.ptr() != storage.ptr()) {
        m_storage = storage;
        m_layout.ndim = 0;
    }
    return static_cast<ChainReturnType&>(*this);
}

DEF(dtype, &)(DType dtype) {
    if (m_layout.dtype != dtype) {
        m_layout.dtype = dtype;
        m_layout.ndim = 0;
    }
    return static_cast<ChainReturnType&>(*this);
}

DEF(format, &)(TensorFormat format) {
    if (m_layout.format != format) {
        m_layout.format = format;
        m_layout.ndim = 0;
    }
    return static_cast<ChainReturnType&>(*this);
}

DEF(operator[], ) (std::initializer_list<Slice> slice) const {
    auto subspec = SubTensorSpec::make_from_offset_elem(m_layout, 0);
    size_t axis = 0;
    for (auto &&i: slice) {
        subspec.merge_with(i.apply(subspec.layout(), axis));
        axis ++;
    }
    return sub(subspec);
}

DEF(sub, )(const SubTensorSpec &spec) const {
    mgb_assert(
            spec.layout().dtype == dtype() && spec.layout().format == format(),
            "invalid subtensor spec: sub_layout=%s self=%s",
            spec.layout().to_string().c_str(), m_layout.to_string().c_str());
    ChainReturnType rst;
    rst.reset(m_storage.sub(spec.offset_byte()), spec.layout());
    return rst;
}

#undef DEF
// def }
/* ===================== TensorND::copy_from ===================== */

namespace {

/**
 * \brief determine whether to check for overlap of two tensors.
 * \return true when HostStorage || (DeviceStorage && SUPPORT_UNIFIED_ADDRESS)
 * \note when both support unified address, we can treat them both as being on
 * CPU, so the overlap check should be done
 */
template <typename TensorStorage, typename RStorage>
inline bool should_check_overlap(const TensorND<TensorStorage>& dst,
                                 const TensorND<RStorage>& src) {
    return true;
}

template <>
inline bool should_check_overlap<HostTensorStorage, DeviceTensorStorage>(
        const HostTensorND& dst, const DeviceTensorND& src) {
    return src.comp_node().contain_flag(
            CompNode::Flag::SUPPORT_UNIFIED_ADDRESS);
}

template <>
inline bool should_check_overlap<DeviceTensorStorage, HostTensorStorage>(
        const DeviceTensorND& dst, const HostTensorND& src) {
    return dst.comp_node().contain_flag(
            CompNode::Flag::SUPPORT_UNIFIED_ADDRESS);
}

/**
 * \brief a D2D tensor copy should check for overlap when
 * 1. the tensors are on the same mem node; note that the addresses must be
 *    logically comparable, e.g. the raw addresses allocated on Enflame are
 *    not comparable.
 * 2. both support unified address, so they can be treated as CPU addresses.
 */
template <>
inline bool should_check_overlap<DeviceTensorStorage, DeviceTensorStorage>(
        const DeviceTensorND& dst, const DeviceTensorND& src) {
    bool is_same_memnode =
            dst.comp_node().mem_node() == src.comp_node().mem_node();
    bool unified_address = src.comp_node().contain_flag(
                                   CompNode::Flag::SUPPORT_UNIFIED_ADDRESS) &&
                           dst.comp_node().contain_flag(
                                   CompNode::Flag::SUPPORT_UNIFIED_ADDRESS);
    return is_same_memnode || unified_address;
}

/**
 * \brief check overlap of two tensors; throw an exception when they overlap
 */
inline void check_overlapped(const dt_byte* dst_min, const dt_byte* dst_max,
                             const dt_byte* src_min, const dt_byte* src_max) {
    mgb_throw_if(src_min < dst_max && dst_min < src_max, TensorCopyOverlapError,
            "could not perform copy between overlapped tensors");
}

} // namespace

template<class TensorStorage>
template<class RStorage>
typename TensorND<TensorStorage>::ChainReturnType&
TensorND<TensorStorage>::copy_from(const TensorND<RStorage> &src) {
    if (!m_storage.comp_node_valid())
        m_storage.comp_node(src.comp_node());
    if (m_layout.dtype.valid())
        m_layout.dtype.assert_is(src.dtype());
    else
        m_layout.dtype = src.dtype();
    m_layout.format = {};
    size_t size_bytes = dtype().size(
            m_layout.init_contiguous_stride(src.shape()));
    m_storage.ensure_size(size_bytes);
    if (!size_bytes) {
        return static_cast<ChainReturnType&>(*this);
    }
    if (src.layout().is_physical_contiguous()) {
        if (should_check_overlap(*this, src)) {
            check_overlapped(m_storage.ptr(),
                             m_storage.ptr() + size_bytes,
                             src.storage().ptr(),
                             src.storage().ptr() + size_bytes);
        }
        m_storage.copy_from(src.storage(), size_bytes);
        return static_cast<ChainReturnType&>(*this);
    }
    return const_cast<ChainReturnType&>(copy_from_fixlayout(src));
}

template <class TensorStorage>
template <class RStorage>
const typename TensorND<TensorStorage>::ChainReturnType&
TensorND<TensorStorage>::copy_from_fixlayout(
        const TensorND<RStorage>& src) const {
    dtype().assert_is(src.dtype());
    mgb_assert(m_layout.eq_shape(src.layout()),
            "shape differs in copy_from_fixlayout: %s vs %s",
            static_cast<const TensorShape&>(m_layout).to_string().c_str(),
            static_cast<const TensorShape&>(src.layout()).to_string().c_str());

    if (src.empty()) {
        return static_cast<const ChainReturnType&>(*this);
    }

    mgb_assert(m_layout.is_non_overlapping_strong(),
            "copy dest must have non-overlapping layout");

    TensorLayout::Span
            src_span = src.layout().span(),
            dst_span = layout().span();

    if (should_check_overlap(*this, src)) {
        check_overlapped(this->raw_ptr() + dst_span.low_byte,
                         this->raw_ptr() + dst_span.high_byte,
                         src.raw_ptr() + src_span.low_byte,
                         src.raw_ptr() + src_span.high_byte);
    }

    bool self_contig = m_layout.is_physical_contiguous(),
         src_contig = src.layout().is_physical_contiguous();
    if (self_contig && src_contig) {
        if (m_layout.format.is_default() && src.layout().format.is_default()) {
            mgb_assert(src_span.low_byte == 0 && dst_span.low_byte == 0 &&
                    src_span.high_byte == dst_span.high_byte);
            m_storage.copy_from(src.storage(), src_span.high_byte);
        } else {
            mgb_assert(src_span.low_byte == 0 && dst_span.low_byte == 0);
            m_storage.copy_from(src.storage(), std::min(src_span.high_byte,
                    dst_span.high_byte));
        }
        return static_cast<const ChainReturnType&>(*this);
    }
    noncont_tensor_copy(*this, src, self_contig, src_contig);
    return static_cast<const ChainReturnType&>(*this);
}
/* =================== misc =================== */

void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) {
    auto&& env = CompNodeEnv::from_comp_node(tensor.comp_node());
    env.activate();
    void* ptr = tensor.raw_ptr();
    size_t size = tensor.layout().span().dist_byte();
    switch (env.property().type) {
#if MGB_CUDA
        case CompNode::DeviceType::CUDA:
            MGB_CUDA_CHECK(
                    cudaMemsetAsync(ptr, val, size, env.cuda_env().stream));
            break;
#endif
#if MGB_ATLAS
        case CompNode::DeviceType::ATLAS:
#if MGB_USE_ATLAS_ASYNC_API
            MGB_ATLAS_CHECK(aclrtMemsetAsync(ptr, -1, val, size,
                    env.atlas_env().stream));
#else
            MGB_ATLAS_CHECK(aclrtMemset(ptr, -1, val, size));
#endif
            break;
#endif
        case CompNode::DeviceType::CPU: {
            auto fill = [ptr, size, val]() { std::memset(ptr, val, size); };
            env.cpu_env().dispatch(fill);
        } break;
        default:
            mgb_throw(MegBrainError,
                    "unhandled comp node in dev_tensor_memset: %s",
                    tensor.comp_node().to_string().c_str());
    }
}

namespace mgb {

template class TensorStorage<HostTensorStorageTrait>;
template class TensorStorage<DeviceTensorStorageTrait>;
template class TensorND<TensorStorage<HostTensorStorageTrait>>;
template class TensorND<TensorStorage<DeviceTensorStorageTrait>>;

/* ===== copy_from related ===== */

#define HT_RAW TensorND<HostTensorStorage>
#define DT_RAW TensorND<DeviceTensorStorage>
#define HT(f) f<HostTensorStorage>(const HT_RAW&)
#define DT(f) f<DeviceTensorStorage> (const DT_RAW&)
#define INST(f, c) \
    template c HostTensorND& HT_RAW::HT(f) c; \
    template c HostTensorND& HT_RAW::DT(f) c; \
    template c DeviceTensorND& DT_RAW::HT(f) c; \
    template c DeviceTensorND& DT_RAW::DT(f) c

INST(copy_from, );
INST(copy_from_fixlayout, const);

#undef INST
#undef DT
#undef HT
#undef DT_RAW
#undef HT_RAW

}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
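
The slicing and copy routines above can be combined from user code roughly as follows. This is only a minimal sketch and is not part of tensor.cpp; it assumes a Slice(begin, end, step) constructor taking Maybe<ptrdiff_t> arguments (matching the m_begin/m_end/m_step members used in Slice::apply) and MegBrain's None placeholder for an unset bound.

#include "megbrain/tensor.h"

// Copy every other row of a 2-D host tensor into a freshly allocated,
// contiguous destination tensor.
void copy_every_other_row(const mgb::HostTensorND& src, mgb::HostTensorND& dst) {
    using namespace mgb;
    // rows 0, 2, 4, ... along axis 0: begin=0, end=None (axis size), step=2
    HostTensorND view = src[{Slice(0, None, 2)}];
    // the view shares storage with src and is not physically contiguous, so
    // copy_from() lays dst out contiguously with the view's shape and then
    // falls back to the noncont_tensor_copy path defined above
    dst.copy_from(view);
}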

The MegEngine installation package ships with the CUDA environment needed to run code on GPU, so there is no separate CPU and GPU build. To run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
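
On the C++ side, a GPU-or-CPU fallback check might look like the sketch below. This is an illustration only, assuming MegBrain's CompNode::get_device_count and CompNode::load interfaces; the exact signatures are not taken from the file above and should be verified against megbrain/comp_node.h.

#include "megbrain/comp_node.h"

// Pick a CUDA comp node when a GPU (and its driver) is available, otherwise
// fall back to CPU; "gpux"/"cpux" request any device of that type.
static mgb::CompNode pick_comp_node() {
    using mgb::CompNode;
    if (CompNode::get_device_count(CompNode::DeviceType::CUDA) > 0)
        return CompNode::load("gpux");
    return CompNode::load("cpux");
}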