
interpreter_impl.cpp

/**
 * \file imperative/src/impl/interpreter_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "./interpreter_impl.h"

#include "megbrain/common.h"
#include "megbrain/imperative/opr_utility.h"
#include "megbrain/imperative/ops/backward_graph.h"
#include "megbrain/imperative/ops/autogen.h"

using namespace mgb;
using namespace imperative;
using namespace interpreter;
using namespace interpreter::intl;

std::unique_ptr<Interpreter::Channel> InterpreterImpl::create_channel() {
    return std::make_unique<ChannelImpl>();
}

Interpreter& Interpreter::inst() {
    static InterpreterImpl inst_;
    return inst_;
}

Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) {
    auto info = alloc();
    info->desc.layout = value.layout();
    info->desc.comp_node = value.comp_node();
    info->desc.value = value.proxy_to_default_cpu();
    info->h_value = value;
    m_valid_handle.insert(info);
    m_buffer.enqueue(Put{info, value, no_cache});
    if (m_async_level == 0) {
        sync();
        info->desc.comp_node.sync();
    }
    return info;
}

Handle ChannelImpl::put(const DeviceTensorND& data) {
    auto info = alloc();
    info->desc.layout = data.layout();
    info->desc.comp_node = data.comp_node();
    info->ptr = Tensor::make(data);
    m_valid_handle.insert(info);
    return info;
}

void ChannelImpl::del(Handle handle) {
    mgb_assert(m_valid_handle.erase(handle), "invalid handle: %p", handle);
    m_buffer.enqueue(Del{reinterpret_cast<TensorInfo*>(handle)});
}

void ChannelImpl::swap_in(Handle handle) {
    if (m_enable_evict & SWAP) {
        mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
                   "invalid handle: %p", handle);
        m_buffer.enqueue(SwapIn{reinterpret_cast<TensorInfo*>(handle)});
    }
}

void ChannelImpl::swap_out(Handle handle) {
    if (m_enable_evict & SWAP) {
        mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
                   "invalid handle: %p", handle);
        m_buffer.enqueue(SwapOut{reinterpret_cast<TensorInfo*>(handle)});
    }
}

void ChannelImpl::drop(Handle handle) {
    if (m_enable_evict & DROP) {
        mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
                   "invalid handle: %p", handle);
        m_buffer.enqueue(Drop{reinterpret_cast<TensorInfo*>(handle)});
    }
}

void ChannelImpl::dispatch_default_cpu(
        std::shared_ptr<OpDef> op,
        const SmallVector<TensorInfo*>& input_infos,
        const SmallVector<LogicalTensorDesc>& input_descs,
        SmallVector<Handle>* outputs) {
    auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);
    SmallVector<DeviceTensorND> input_tensornds;
    input_tensornds.reserve(input_descs.size());
    CompNode output_cn;
    {
        MGB_LOCK_GUARD(m_mutex);
        for (auto&& info : input_infos) {
            auto input_cn = info->desc.comp_node;
            if (!output_cn.valid()) {
                output_cn = input_cn;
            } else {
                mgb_assert(output_cn == input_cn, "cannot decide output comp node");
            }

            if (info->ptr && info->ptr->try_get_value()) {
                input_tensornds.emplace_back(info->ptr->get_value().proxy_to_default_cpu());
            } else {
                mgb_assert(!info->h_value.empty(), "inp->h_value is empty!");
                input_tensornds.emplace_back(info->h_value.proxy_to_default_cpu());
            }
        }
    }

    outputs->reserve(output_descs.size());
    SmallVector<DeviceTensorND> output_tensornds;
    output_tensornds.reserve(output_descs.size());
    for (auto&& desc : output_descs) {
        // TODO: may conflict with condtake, which need alloc inside
        mgb_assert(!desc.layout.is_empty());
        // use HostTensorND alloc_host for cuda pinned memory
        output_tensornds.emplace_back(HostTensorND(output_cn, desc.layout).proxy_to_default_cpu());
    }

    OpDef::apply_on_device_tensornd(*op, input_tensornds, &output_tensornds);

    SmallVector<TensorInfo*> output_infos;
    output_infos.reserve(output_descs.size());
    for (auto&& tensornd : output_tensornds) {
        HostTensorND host_tensornd = HostTensorND::make_proxy(tensornd)
                .proxy_to_comp_node(output_cn);
        // use `put` for consistency
        auto info = reinterpret_cast<TensorInfo*>(put(host_tensornd, false));
        mgb_assert(info->desc.layout.ndim != 0);
        output_infos.push_back(info);
        outputs->push_back(info);
    }

    if (m_enable_evict & DROP) {
        for (auto out : output_infos) {
            out->path.op = op;
            for (auto out_ : output_infos) {
                out->path.outputs.push_back(m_st.at(out_));
            }
            for (auto inp : input_infos) {
                out->path.inputs.push_back(m_st.at(inp));
                inp->path.dep_outputs.push_back(m_st.at(out));
            }
        }
    }
}

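// dispatch_default_cpu (above) evaluates the op eagerly on host ("default_cpu")
// tensors and never goes through the worker queue; dispatch_kernel (below) only
// records an ApplyOp command and defers execution to the worker thread.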
void ChannelImpl::dispatch_kernel(
        std::shared_ptr<OpDef> op,
        const SmallVector<TensorInfo*>& input_infos,
        const SmallVector<LogicalTensorDesc>& input_descs,
        SmallVector<Handle>* outputs) {
    auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs);

    ApplyOp cmd{std::move(op)};
    cmd.inputs = std::move(input_infos);
    cmd.outputs.reserve(output_descs.size());
    outputs->reserve(output_descs.size());
    for (auto&& desc : output_descs) {
        auto info = alloc();
        info->desc = desc;
        // make sure desc's value is consistent with h_value
        if (!info->desc.value.empty()) {
            info->h_value = HostTensorND::make_proxy(desc.value)
                    .proxy_to_comp_node(desc.comp_node);
        }
        m_valid_handle.insert(info);
        cmd.outputs.push_back(info);
        outputs->push_back(info);
    }
    if (m_enable_evict & DROP) {
        for (auto out : cmd.outputs) {
            out->path.op = cmd.op;
            for (auto out_ : cmd.outputs) {
                out->path.outputs.push_back(m_st.at(out_));
            }
            for (auto inp : cmd.inputs) {
                out->path.inputs.push_back(m_st.at(inp));
                inp->path.dep_outputs.push_back(m_st.at(out));
            }
        }
    }
    m_buffer.enqueue(std::move(cmd));
    if (!validated && m_async_level == 1) {
        sync();
    } else if (m_async_level == 0) {
        sync();
        // check device error
        for (auto&& oup : *outputs) {
            auto info = reinterpret_cast<TensorInfo*>(oup);
            info->ptr->comp_node().sync();
        }
    }
}

SmallVector<Handle> ChannelImpl::apply_op(
        std::shared_ptr<OpDef> op,
        const SmallVector<Handle>& inputs) {
    for (auto i : inputs) {
        mgb_assert(m_valid_handle.find(i) != m_valid_handle.end(),
                   "invalid handle: %p", i);
    }
    SmallVector<TensorInfo*> input_infos;
    input_infos.reserve(inputs.size());
    SmallVector<LogicalTensorDesc> input_descs;
    input_descs.reserve(inputs.size());
    {
        MGB_LOCK_GUARD(m_mutex);
        for (auto i : inputs) {
            auto info = reinterpret_cast<TensorInfo*>(i);
            mgb_assert(!info->invalid, "Invalid tensor, unable to apply_op!");
            input_infos.push_back(info);
            input_descs.push_back(info->desc);
        }
    }

    SmallVector<Handle> outputs;
    switch (OpDef::decide_dispatch_mode(*op, input_descs)) {
        case DEFAULT_CPU: {
            dispatch_default_cpu(op, input_infos, input_descs, &outputs);
            break;
        }
        case KERNEL: {
            dispatch_kernel(op, input_infos, input_descs, &outputs);
            break;
        }
    }
    return outputs;
}

HostTensorND ChannelImpl::get_value(Handle handle) {
    mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
               "invalid handle: %p", handle);
    auto info = reinterpret_cast<TensorInfo*>(handle);
    std::unique_lock<decltype(m_mutex)> lock(m_mutex);
    mgb_assert(!m_waitee);
    if (!info->value_fetched) {
        mgb_assert(!info->invalid, "Invalid tensor, unable to get_value!");
        m_waitee = info;
        m_buffer.enqueue(GetValue{info});
        m_cv.wait(lock, [&]() {
            check_worker_exc_unsafe();
            return info->value_fetched;
        });
        m_waitee = nullptr;
    }
    mgb_assert(info->ptr->value_fetched());
    return info->ptr->get_value();
}

TensorShape ChannelImpl::get_shape(Handle handle) {
    mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
               "invalid handle: %p", handle);
    auto info = reinterpret_cast<TensorInfo*>(handle);
    if (info->desc.layout.ndim != 0) {
        return info->desc.layout;
    }
    std::unique_lock<decltype(m_mutex)> lock(m_mutex);
    mgb_assert(!m_waitee);
    m_waitee = info;
    m_buffer.enqueue(Flush{info});
    m_cv.wait(lock, [&]() {
        check_worker_exc_unsafe();
        return static_cast<bool>(info->ptr);
    });
    m_waitee = nullptr;
    TensorShape ret = info->ptr->layout();
    mgb_assert(ret.ndim != 0);
    return ret;
}

DType ChannelImpl::get_dtype(Handle handle) {
    mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
               "invalid handle: %p", handle);
    auto info = reinterpret_cast<TensorInfo*>(handle);
    auto ret = info->desc.layout.dtype;
    mgb_assert(ret.valid());
    return ret;
}

CompNode ChannelImpl::get_device(Handle handle) {
    mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
               "invalid handle: %p", handle);
    auto info = reinterpret_cast<TensorInfo*>(handle);
    auto ret = info->desc.comp_node;
    mgb_assert(ret.valid());
    return ret;
}

DeviceTensorND ChannelImpl::get_dev_tensor(Handle handle) {
    mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
               "invalid handle: %p", handle);
    auto info = reinterpret_cast<TensorInfo*>(handle);
    std::unique_lock<decltype(m_mutex)> lock(m_mutex);
    mgb_assert(!m_waitee);
    m_waitee = info;
    m_buffer.enqueue(Flush{info});
    m_cv.wait(lock, [&]() {
        check_worker_exc_unsafe();
        return static_cast<bool>(info->ptr);
    });
    m_waitee = nullptr;
    return info->ptr->dev_tensor();
}

void ChannelImpl::sync() {
    if (!m_buffer.empty()) {
        m_buffer.enqueue(Flush{});
    }
    m_worker.wait_all_task_finish();
    MGB_LOCK_GUARD(m_mutex);
    check_worker_exc_unsafe();
}

void ChannelImpl::close() {
    sync();
}

void ChannelImpl::config_async_level(int level) {
    mgb_assert(level <= 2 && level >= 0, "async_level should be 0, 1 or 2");
    m_async_level = level;
}

int ChannelImpl::get_async_level() {
    return m_async_level;
}

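// Async levels, as used by the dispatch paths above: level 0 syncs after every
// command and additionally waits on each output's comp_node so device errors
// surface immediately; level 1 syncs only when infer_output_attrs_fallible could
// not fully validate the output descriptors; level 2 is fully asynchronous.
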
TensorInfo* ChannelImpl::alloc() {
    MGB_LOCK_GUARD(m_mutex);
    auto info = m_pool.alloc();
    m_st.insert(info);
    return info;
}

void ChannelImpl::free(TensorInfo* ptr) {
    MGB_LOCK_GUARD(m_mutex);
    if (ptr->path.dep_outputs.size() > 0) {
        remove_dep(ptr);
    }
    m_st.erase(ptr);
    mgb_assert(ptr->allow_delete, "delete before ref_cnt = 0");
    m_pool.free(ptr);
}

ChannelImpl::~ChannelImpl() {
    close();
}

void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr, bool notice = true) {
    auto lock = notice ? std::unique_lock<std::mutex>(m_mutex)
                       : std::unique_lock<std::mutex>();
    dest->value_fetched = ptr->value_fetched();
    // update tensor desc for static infer
    dest->desc.layout = ptr->layout();
    dest->desc.comp_node = ptr->comp_node();
    dest->ptr = std::move(ptr);
    if (notice && m_waitee == dest) {
        m_cv.notify_all();
    }
}

void ChannelImpl::do_swap_out(TensorInfo* dest) {
    if (dest->evict_type == DROP) {
        mgb_log_warn("the evict type of tensor %p was set to DROP, this SWAP operation will be ignored", dest);
        return;
    }
    if (!dest->ptr) {
        return;
    }
    dest->evict_type = SWAP;
    dest->value_fetched = false;
    // TODO: swap in parallel
    dest->h_value = dest->ptr->get_value();
    dest->ptr.reset();
}

void ChannelImpl::do_swap_in(TensorInfo* dest) {
    if (dest->ptr) {
        return;
    }
    if (dest->h_value.empty()) {
        mgb_log_error("backup of the tensor %p not found", dest);
        return;
    }
    produce_tensor(dest, Tensor::make(dest->h_value), false);
    dest->evict_type = NONE;
}

void ChannelImpl::remove_dep(TensorInfo* dest) {
    for (auto i : dest->path.dep_outputs) {
        auto out_ptr = i.lock();
        if (out_ptr) {
            regenerate(out_ptr.get(), true);
        }
    }
}

void ChannelImpl::do_drop(TensorInfo* dest) {
    if (dest->evict_type == SWAP) {
        mgb_log_warn("the evict type of tensor %p was set to SWAP, this DROP operation will be ignored", dest);
        return;
    }
    if (!dest->path.op) {
        mgb_log_warn("the input that produced tensor %p has been deleted, this drop operation will be ignored", dest);
        return;
    }
    if (dest->recompute_times >= m_max_recompute_time) {
        mgb_log_warn("the recomputation time for tensor %p exceeds the limit, this drop operation will be ignored", dest);
        return;
    }
    if (!dest->ptr) {
        return;
    }
    dest->evict_type = DROP;
    dest->value_fetched = false;
    dest->ptr.reset();
}

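// Eviction semantics: SWAP keeps a host copy in h_value and releases the device
// tensor, so do_swap_in can restore it from host memory; DROP releases the device
// tensor without a backup and relies on the recorded TensorInfo::path to
// recompute the value on demand (see regenerate below).
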
void ChannelImpl::set_swap_flag(bool flag) {
    if (flag) {
        m_enable_evict |= SWAP;
    } else {
        m_enable_evict &= ~SWAP;
    }
}

void ChannelImpl::set_drop_flag(bool flag) {
    if (flag) {
        m_enable_evict |= DROP;
    } else {
        m_enable_evict &= ~DROP;
    }
}

void ChannelImpl::set_buffer_length(int length) {
    m_buffer.set_capacity(length);
}

void ChannelImpl::regenerate(TensorInfo* info, bool must_drop = false) {
    if (!info->ptr && info->evict_type != NONE) {
        if (info->evict_type == SWAP) {
            do_swap_in(info);
        } else {
            mgb_assert(info->evict_type == DROP);
            mgb_assert(info->path.op, "recomputation path not found");
            auto path = info->path;
            SmallVector<TensorPtr> inputs;
            inputs.reserve(path.inputs.size());
            for (auto i : path.inputs) {
                mgb_assert(i, "invalid history input");
                if (!i->ptr) {
                    regenerate(i.get(), must_drop);
                }
                inputs.push_back(i->ptr);
            }
            auto outputs = OpDef::apply_on_physical_tensor(*path.op, inputs);
            for (size_t i = 0; i < outputs.size(); i++) {
                auto out_ptr = path.outputs[i].lock();
                if (out_ptr) {
                    out_ptr->recompute_times++;
                    if (!out_ptr->ptr && out_ptr->evict_type == DROP) {
                        produce_tensor(out_ptr.get(), std::move(outputs[i]), false);
                    }
                }
            }
        }
    }
    if (must_drop) {
        if (info->path.op) {
            info->path.op.reset();
            info->path.inputs.clear();
            if (info->evict_type == DROP) {
                info->evict_type = NONE;
            }
        }
    }
}

void ChannelImpl::process_one_task(Command& cmd) {
    // TODO: remove std::visit to support osx 10.12
    std::visit([this](auto& cmd) {
        using T = std::remove_reference_t<decltype(cmd)>;
        try {
            if constexpr (std::is_same_v<T, Put>) {
                auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value);
                produce_tensor(cmd.dest, std::move(value));
            } else if constexpr (std::is_same_v<T, ApplyOp>) {
                SmallVector<TensorPtr> tensor_inputs;
                tensor_inputs.reserve(cmd.inputs.size());
                // refcnt == 1, owners: [TensorInfo::ptr]
                for (auto i : cmd.inputs) {
                    if (m_enable_evict && i->evict_type != NONE) {
                        if (!i->ptr) {
                            regenerate(i);
                        }
                    }
                    mgb_assert(i->ptr, "Invalid input tensor ptr!");
                    // refcnt ++, owners: [i->ptr, tensor_inputs]
                    tensor_inputs.push_back(i->ptr);
                }
                // Fused by command buffer. @see: CommandBuffer::fuse_del
                // Now if dest is inplacable, its refcnt drops to 1 after the Del, with tensor_inputs as the only owner.
                // Note: for exprs like 'y = x op x', inplace is not supported yet, but the Del is still fused.
                for (auto* del : cmd.dels) {
                    // refcnt --, owners: [tensor_inputs]
                    // if it drops to 1, that is detected at @see: proxy_graph_detail::apply_on_physical_tensor
                    free(del);
                }
                // Here std::move is REQUIRED for removing duplicated references.
                auto tensor_outputs = OpDef::apply_on_physical_tensor(
                        *cmd.op, std::move(tensor_inputs));
                mgb_assert(tensor_outputs.size() == cmd.outputs.size());
                for (size_t i = 0; i < tensor_outputs.size(); ++i) {
                    produce_tensor(cmd.outputs[i], std::move(tensor_outputs[i]));
                }
            } else if constexpr (std::is_same_v<T, Del>) {
                free(cmd.dest);
            } else if constexpr (std::is_same_v<T, GetValue>) {
                if (m_enable_evict && cmd.dest->evict_type != NONE) {
                    if (!cmd.dest->ptr) {
                        regenerate(cmd.dest);
                    }
                }
                mgb_assert(cmd.dest->ptr, "Invalid tensor ptr!");
                cmd.dest->ptr->fetch_value();
                MGB_LOCK_GUARD(m_mutex);
                cmd.dest->value_fetched = true;
                if (m_waitee == cmd.dest) {
                    m_cv.notify_all();
                }
            } else if constexpr (std::is_same_v<T, SwapIn>) {
                do_swap_in(cmd.dest);
            } else if constexpr (std::is_same_v<T, SwapOut>) {
                do_swap_out(cmd.dest);
            } else if constexpr (std::is_same_v<T, Drop>) {
                do_drop(cmd.dest);
            } else if constexpr (std::is_same_v<T, Move>) {
                produce_tensor(cmd.dest, cmd.src->ptr);
                free(cmd.src);
            } else {
                static_assert(std::is_same_v<T, Flush> ||
                              std::is_same_v<T, Nop>);
            }
        } catch (...) {
            MGB_LOCK_GUARD(m_mutex);
            if constexpr (std::is_same_v<T, ApplyOp>) {
                for (auto oup : cmd.outputs) {
                    oup->invalid = true;
                }
            } else if constexpr (std::is_same_v<T, Put>) {
                cmd.dest->invalid = true;
            }
            m_worker_exc = std::current_exception();
            m_cv.notify_all();
        }
    }, cmd);
}

void ChannelImpl::check_worker_exc_unsafe() {
    if (m_worker_exc) {
        // allow interpreter_for_py to be reused after exception tests
        m_waitee = nullptr;

        std::exception_ptr exc;
        std::swap(exc, m_worker_exc);
        std::rethrow_exception(exc);
    }
}

void ChannelImpl::CommandBuffer::enqueue(Command cmd) {
    if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) {
        return;
    }
    auto command_repr = std::visit([](auto& cmd) { return cmd.to_string(); }, cmd);
    mgb_log_debug("%s Enqueued", command_repr.c_str());
    m_commands.push_back(std::move(cmd));
    auto flush_pos = flush_pos_for(m_commands.back());
    flush(flush_pos);
}

void ChannelImpl::CommandBuffer::flush(Handle pos) {
    for (auto iter = m_commands.begin(); iter != pos; ++iter) {
        auto command_repr = std::visit([](auto& cmd) { return cmd.to_string(); }, *iter);
        mgb_log_debug("%s Flushed", command_repr.c_str());
        m_owner->m_worker.add_task(std::move(*iter));
    }
    m_commands.erase(m_commands.begin(), pos);
}

auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle {
    return std::visit([this](const auto& cmd) {
        using T = std::decay_t<decltype(cmd)>;
        if constexpr (std::is_same_v<T, ApplyOp>) {
            auto* op_type = cmd.op->dyn_typeinfo();
            if (op_type == RemoteRecv::typeinfo() ||
                op_type == RemoteSend::typeinfo() ||
                op_type == CollectiveComm::typeinfo() ||
                op_type == opr::InputCallback::typeinfo() ||
                op_type == opr::OutputCallback::typeinfo() ||
                op_type == BackwardGraph::typeinfo()) {
                return m_commands.end();
            }
        } else if constexpr (std::is_same_v<T, GetValue>) {
            return m_commands.end();
        } else if constexpr (std::is_same_v<T, Flush>) {
            if (cmd.dest == nullptr) {
                return m_commands.end();
            }
            auto produce_iter = find_produce(cmd.dest, {m_commands.begin(), m_commands.end()});
            if (produce_iter != m_commands.end()) {
                return produce_iter + 1;
            }
        }
        if (m_commands.size() > m_capacity) {
            return m_commands.begin() + (m_commands.size() - m_capacity);
        }
        return m_commands.begin();
    }, cmd);
}

/**
 * 1. Find the ApplyOp that consumes dest in the buffered commands
 * 2. Check whether dest is used again between that ApplyOp and the Del; if so, return false
 * 3. Otherwise fuse the Del into the ApplyOp and return true
 */
bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) {
    auto* dest = cmd.dest;
    // TODO: eliminate Puts
    auto begin = m_commands.begin(), end = m_commands.end();
    auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd) {
        if (auto* apply = std::get_if<ApplyOp>(&cmd)) {
            return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0;
        }
        return false;
    });
    if (apply_iter == end || find_last_usage(dest, {apply_iter + 1, end}) != end) {
        return false;
    }
    mgb_log_debug("%s Fused", cmd.to_string().c_str());
    std::get<ApplyOp>(*apply_iter).dels.push_back(dest);
    return true;
}

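// Illustrative (hypothetical) buffer state for the doc-comment above:
//   [ApplyOp{op, inputs={x, w}, outputs={y}}, Put{z}, Del{x}]
// x is not used again after the ApplyOp, so the Del is folded into
// ApplyOp::dels instead of being enqueued, and the worker frees x right
// after the op has consumed it (see process_one_task above).
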
auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range)
        -> Handle {
    auto found = range[1];
    for (auto iter = range[0]; iter != range[1]; ++iter) {
        std::visit([&](const auto& cmd) {
            using T = std::decay_t<decltype(cmd)>;
            if constexpr (std::is_same_v<T, ApplyOp>) {
                if (std::count(cmd.inputs.begin(), cmd.inputs.end(),
                               dest) > 0) {
                    found = iter;
                }
            } else if constexpr (std::is_same_v<T, GetValue>) {
                if (cmd.dest == dest) {
                    found = iter;
                }
            } else if constexpr (std::is_same_v<T, SwapIn> ||
                                 std::is_same_v<T, SwapOut> ||
                                 std::is_same_v<T, Drop>) {
                // TODO: ignore swap-like commands, just remove them from buffer
                if (cmd.dest == dest) {
                    found = iter;
                }
            }
        }, *iter);
    }
    return found;
}

auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range)
        -> Handle {
    return std::find_if(range[0], range[1], [dest](auto& cmd) {
        return std::visit([dest](const auto& cmd) {
            using T = std::decay_t<decltype(cmd)>;
            if constexpr (std::is_same_v<T, ApplyOp>) {
                return std::count(cmd.outputs.begin(), cmd.outputs.end(), dest) > 0;
            } else if constexpr (std::is_same_v<T, Put>) {
                return cmd.dest == dest;
            }
            return false;
        }, cmd);
    });
}
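For orientation, a minimal usage sketch of the channel API implemented above, assuming `op` is a valid `std::shared_ptr<OpDef>` and `host` is an already-filled `HostTensorND` (this snippet is not part of the file):

// Sketch only: drives ChannelImpl through the public Interpreter::Channel interface.
auto channel = Interpreter::inst().create_channel();
channel->config_async_level(2);                       // fully asynchronous dispatch
auto x = channel->put(host, /* no_cache */ false);    // enqueues a Put command
auto outputs = channel->apply_op(op, {x});            // enqueues an ApplyOp command
HostTensorND value = channel->get_value(outputs[0]);  // blocks until the worker fetches the value
channel->del(x);
for (auto&& out : outputs) {
    channel->del(out);
}
channel->close();                                     // flushes the buffer and waits for the worker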
