
physical_tensor.cpp

/**
 * \file imperative/src/impl/physical_tensor.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megbrain/imperative.h"
#include "megbrain/imperative/blob_manager.h"

#include "./event_pool.h"
#include "./async_releaser.h"

#include <mutex>

namespace mgb {
namespace imperative {

namespace {
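
// Maps each live Blob to a CompNode::Event so that Tensor::get_or_create_event()
// can record (and callers can later wait on) the blob's most recent write.
// Entries are erased again in Blob::~Blob() via remove().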
class CompNodeSyncManager : public CompNodeDepedentObject {
    ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
    std::mutex m_mtx;

public:
#if MGB_CUDA && defined(WIN32)
    //! FIXME: the Windows CUDA driver shuts down before atexit callbacks run,
    //! even when the callback is registered after the CUDA driver has been
    //! initialized. As a workaround, let the OS reclaim the resources for now;
    //! this may be removed after a CUDA runtime upgrade.
    static bool is_into_atexit;
#endif

    std::shared_ptr<void> on_comp_node_finalize() override {
        MGB_LOCK_GUARD(m_mtx);
        m_blob2event.clear();
        return {};
    }

    static CompNodeSyncManager& inst() {
        static CompNodeSyncManager sl_inst;
#if MGB_CUDA && defined(WIN32)
        //! FIXME: see the comment on is_into_atexit above.
        if (!is_into_atexit) {
            auto err = atexit([] { is_into_atexit = true; });
            mgb_assert(!err, "failed to register atexit function");
        }
#endif
        return sl_inst;
    }

    CompNode::Event* get_or_create_event(Blob* blob) {
        mgb_assert(!is_finalized());
        MGB_LOCK_GUARD(m_mtx);
        auto&& e = m_blob2event[blob];
        if (!e) {
            e = blob->comp_node().create_event();
        }
        return e.get();
    }

    void remove(Blob* blob) {
        MGB_LOCK_GUARD(m_mtx);
        m_blob2event.erase(blob);
    }
};

#if MGB_CUDA && defined(WIN32)
//! FIXME: see the comment on CompNodeSyncManager::is_into_atexit above.
bool CompNodeSyncManager::is_into_atexit = false;
#endif
// Cache for small blobs
// 1. A blob has to be seen twice (within a window) to be eligible for caching
// 2. Cache eviction occurs when the cache size reaches a threshold, in least
//    frequently used order
class ConstTensorCache {
public:
    struct Entry {
        size_t hitcnt = 0;
        std::unique_ptr<dt_byte[]> data;
        size_t size;
        BlobPtr blob;

        Entry() = default;
        Entry(const dt_byte* ptr, size_t size_, BlobPtr blob_)
                : data(new dt_byte[size_]), size(size_), blob(blob_) {
            memcpy(data.get(), ptr, size);
        }

        // does not check input
        bool match(const HostTensorND& hv) {
            return 0 == memcmp(data.get(), hv.raw_ptr(), hv.layout().span().high_byte);
        }
    };

    using KV = std::pair<uint64_t, Entry>;

    bool check(const HostTensorND& hv) {
        auto&& layout = hv.layout();
        auto&& span = layout.span();
        return hv.format().is_default() && !hv.empty() &&
               layout.is_contiguous() && span.low_byte == 0 &&
               span.high_byte <= max_bytes;
    }

    // hash storage; does not check input
    static uint64_t hash(const HostTensorND& hv) {
        auto&& span = hv.layout().span();
        return XXHash{}
                .update(hv.raw_ptr(), span.high_byte)
                .digest();
    }

    BlobPtr lookup(const HostTensorND& hv) {
        if (!check(hv)) {
            return {};
        }
        auto h = hash(hv);
        MGB_LOCK_GUARD(mtx);
        // lookup in g1
        auto it = g1.find(h);
        if (it != g1.end()) {
            if (!it->second.match(hv)) {
                mgb_log_warn("hash collision in const tensor cache");
                return {};
            }
            it->second.hitcnt += 1;
            return it->second.blob;
        }
        // lookup in g0
        if (!g0.extract(h) && !g0b.extract(h)) {
            maybe_collect_g0();
            g0.emplace(h);
            return {};
        }
        // add new entry to g1
        maybe_collect_g1();
        Entry entry(hv.raw_ptr(), hv.layout().span().high_byte, Tensor(hv).blob());
        it = g1.emplace_hint(it, h, std::move(entry));
        it->second.hitcnt += 1;
        return it->second.blob;
    }

    void clear() {
        MGB_LOCK_GUARD(mtx);
        g0.clear();
        g0b.clear();
        g1.clear();
    }

    std::mutex mtx;
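    // Tuning constants (as used below): `window` bounds how many first-sighting
    // hashes g0 holds before it is rotated into g0b; `hwm`/`lwm` are the high and
    // low water marks for g1, which is compacted down to its `lwm` most frequently
    // hit entries once it reaches `hwm`; `max_bytes` caps the tensor size that is
    // eligible for caching.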
    const size_t hwm = 1024, lwm = 512, max_bytes = TensorShape::MAX_NDIM * 8,
                 window = 65536;

private:
    void maybe_collect_g0() {
        if (g0.size() > window) {
            std::swap(g0, g0b);
            g0.clear();
        }
    }

    void maybe_collect_g1() {
        if (g1.size() < hwm) return;

        tmp.clear();
        for (auto&& kv : g1) {
            tmp.emplace_back(kv.first, std::move(kv.second));
        }
        std::nth_element(tmp.begin(), tmp.begin() + lwm, tmp.end(),
                         [](const KV& lhs, const KV& rhs) {
                             return lhs.second.hitcnt > rhs.second.hitcnt;
                         });
        tmp.resize(lwm);
        g1.clear();
        for (auto&& kv : tmp) {
            kv.second.hitcnt = 0;
            g1.emplace(std::move(kv));
        }
    }

    // g0: records blobs which have been seen at least once (within a window)
    // g0b: backup of g0
    // g1: records the most frequently used blobs which have been seen at least
    //     twice. When `g1.size() == hwm`, it is refreshed and only the top
    //     `lwm` most frequently used blobs are kept.
    std::unordered_set<uint64_t> g0, g0b;
    std::unordered_map<uint64_t, Entry> g1;
    std::vector<KV> tmp;

public:
    ConstTensorCache() {
        g0.reserve(window), g0b.reserve(window);
        g1.reserve(hwm), tmp.reserve(hwm);
    }
};
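
// Note on admission (see ConstTensorCache::lookup above): the first lookup of a
// given host tensor only records its hash in g0 and returns an empty BlobPtr; a
// second lookup within the same window promotes it into g1, performing the
// host-to-device copy once via Tensor(hv). MultiCNConstTensorCache below keeps
// one such cache per CompNode.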
struct MultiCNConstTensorCache : CompNodeDepedentObject {
    std::mutex mtx;
    CompNode::UnorderedMap<ConstTensorCache> cn2cache;

    std::shared_ptr<void> on_comp_node_finalize() override {
        MGB_LOCK_GUARD(mtx);
        cn2cache.clear();
        return {};
    }

    BlobPtr lookup(const HostTensorND& hv) {
        MGB_LOCK_GUARD(mtx);
        return cn2cache[hv.comp_node()].lookup(hv);
    }

    static MultiCNConstTensorCache& inst() {
        static MultiCNConstTensorCache sl_inst;
        return sl_inst;
    }
};

}  // namespace

void EventDeleter::operator()(CompNode::Event* event) {
    EventPool::without_timer().free(event);
}

namespace {
std::atomic_uint64_t next_blob_id = 0;
}
Blob::Blob(const DeviceTensorStorage& s):
        m_comp_node{s.comp_node()}, m_storage{s.raw_storage()},
        m_size{s.size() + s.offset()} {
    m_id = next_blob_id++;
    BlobManager::inst()->register_blob(this);
}

Blob::Blob(CompNode cn, size_t sz):
        m_comp_node{cn}, m_storage{}, m_size{sz} {
    m_id = next_blob_id++;
    BlobManager::inst()->register_blob(this);
}

Blob::~Blob() {
    BlobManager::inst()->unregister_blob(this);
#if MGB_CUDA && defined(WIN32)
    //! FIXME: see the comment on CompNodeSyncManager::is_into_atexit above.
    if (CompNodeSyncManager::is_into_atexit)
        return;
#endif
    CompNodeSyncManager::inst().remove(this);
}
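
// For blobs created without backing storage (Blob(CompNode, size_t)), allocation
// is deferred: storage() asks BlobManager to allocate (defragmenting if necessary)
// on first access rather than at construction time.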
const Blob::RawStorage& Blob::storage() {
    if (!m_storage) {
        BlobManager::inst()->alloc_with_defrag(this, m_size);
    }
    return m_storage;
}

Tensor::Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset, const HostTensorND& hv)
        : m_layout(layout), m_blob(std::move(blob)), m_offset(offset), m_value(hv) {}

Tensor::Tensor(const HostTensorND &hv)
        : Tensor(hv.layout(), hv.comp_node()) {
    m_value = hv;
    dev_tensor().copy_from_fixlayout(hv);
    // even though hv is saved in m_value, Tensor itself could be
    // released before the copy completes
    AsyncReleaser::inst()->add(hv);
}

Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) {
    if (!hv.empty()) {
        mgb_assert(dv.comp_node() == hv.comp_node());
        mgb_assert(dv.dtype() == hv.dtype());
        mgb_assert(dv.shape().eq_shape(hv.shape()));
        m_value = hv;
    }
    m_layout = dv.layout();
    m_blob = Blob::make(dv.storage());
    m_offset = dv.storage().offset();
}

Tensor::Tensor(const TensorLayout& layout, const CompNode& cn)
        : m_layout{layout}, m_blob{Blob::make(cn, layout.span().dist_byte())},
          m_offset{0} {}

Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout)
        : m_layout{layout}, m_blob{blob}, m_offset{offset} {}
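
// make(HostTensorND): small constant host tensors are deduplicated through
// MultiCNConstTensorCache; on a cache hit the existing device blob is reused
// instead of allocating and copying again.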
TensorPtr Tensor::make(const HostTensorND& hv) {
    auto&& blob = MultiCNConstTensorCache::inst().lookup(hv);
    if (blob) {
        return make(std::forward<decltype(blob)>(blob), hv.layout(), hv);
    }
    return std::make_shared<Tensor>(hv);
}

DeviceTensorND Tensor::dev_tensor() {
    mgb_assert(m_blob, "uninitialized tensor.");
    DeviceTensorStorage storage;
    storage.reset(m_blob->comp_node(), m_blob->size(), m_blob->storage());
    storage = storage.sub(m_offset);
    DeviceTensorND ret;
    ret.reset(storage, m_layout);
    return ret;
}
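
// Host value caching: fetch_value() issues the device-to-host copy and records
// a completion event; get_value() blocks on that event, whereas try_get_value()
// returns the cached value only once the copy has already finished.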
void Tensor::fetch_value() {
    MGB_LOCK_GUARD(m_mtx);
    if (m_value.empty()) {
        m_value.copy_from(dev_tensor());
        m_value_ready.reset(EventPool::without_timer().alloc(comp_node()));
        m_value_ready->record();
    }
}

bool Tensor::value_fetched() {
    MGB_LOCK_GUARD(m_mtx);
    return m_value.layout().ndim != 0;
}

const HostTensorND& Tensor::get_value() {
    fetch_value();
    if (m_value_ready) {
        m_value_ready->host_wait();
    }
    return m_value;
}

const HostTensorND* Tensor::try_get_value() {
    MGB_LOCK_GUARD(m_mtx);
    if (!m_value.empty() && (!m_value_ready || m_value_ready->finished())) {
        return &m_value;
    }
    return nullptr;
}

TensorPtr Tensor::make_scalar(DTypeScalar value, CompNode cn) {
    HostTensorND hv{cn, value.dtype()};
    hv.resize({1});
    memcpy(hv.raw_ptr(), value.storage(), value.dtype().size(1));
    return make(hv);
}

TensorPtr Tensor::sub(size_t offset, TensorShape shape) {
    TensorLayout layout(shape, m_layout.dtype);
    return Tensor::make(m_blob, offset + m_offset, layout);
}

void Tensor::add_release_callback(CompNode cn) {
    AsyncReleaser::inst()->add(m_blob, cn);
}

CompNode::Event* Tensor::get_or_create_event() {
    auto e = CompNodeSyncManager::inst().get_or_create_event(m_blob.get());
    e->record();
    return e;
}

void Tensor::static_initialize() {
    EventPool::with_timer();
    EventPool::without_timer();
    AsyncReleaser::inst();
    CompNodeSyncManager::inst();
    MultiCNConstTensorCache::inst();
}

}  // namespace imperative
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
