
megbrain_wrap.cpp 32 kB

/**
 * \file python_module/src/cpp/megbrain_wrap.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 */

#include "./megbrain_wrap.h"
#include "./python_helper.h"
#include "./megbrain_pubapi_internal.h"

#include "megbrain/version.h"
#include "megbrain/tensor.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/utility.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/utils/thread.h"
#include "megbrain/utils/timer.h"

#include <cstring>

using namespace mgb;

namespace {

bool g_global_finalize_called = false;

/*!
 * \brief record the vars produced from user-created Host2DeviceCopy
 *
 * Note that the vars are mapped by address of underlying HostTensorND, so
 * in the case of partial execution, vars in the parent graph can be
 * retrieved from oprs in the sub graphs.
 */
class UserInputVars final : public UserDataContainer::UserData {
    MGB_TYPEINFO_OBJ_DECL;

    //! we keep this mapping to handle multi-part compiling, where new
    //! graphs would be created and the var in the original graph is needed
    ThinHashMap<HostTensorND*, VarNode*> m_tensor2var;

public:
    void register_var(SymbolVar x) {
        m_tensor2var[x.node()->owner_opr()
                             ->cast_final_safe<opr::Host2DeviceCopy>()
                             .host_data()
                             .get()] = x.node();
    }

    //! get the corresponding var from an opr if it has been registered;
    //! return nullptr otherwise
    VarNode* check(cg::OperatorNodeBase* opr) const {
        if (opr->same_type<opr::Host2DeviceCopy>()) {
            auto ptr = opr->cast_final<opr::Host2DeviceCopy>()
                               .host_data()
                               .get();
            auto iter = m_tensor2var.find(ptr);
            return iter == m_tensor2var.end() ? nullptr : iter->second;
        }
        return nullptr;
    }

    static UserInputVars& get(ComputingGraph* graph) {
        return *graph->options()
                        .user_data.get_user_data_or_create<UserInputVars>();
    }
};

__attribute__((constructor))
void global_init() {
    CompNode::enable_affinity_for_cpu(true);
}

} // anonymous namespace

MGB_TYPEINFO_OBJ_IMPL(UserInputVars);

/* ================= SharedND ================= */

bool SharedND::sync(mgb::DeviceTensorND &dv) {
    if (m_copy_sync) {
        dv.sync();
        return true;
    }
    return false;
}

void SharedND::_set_init_shape(const std::vector<size_t> &shape) {
    mgb_assert(m_dev_tensor && m_dev_tensor->empty());
    m_dev_tensor->resize(npy::vec2shape(shape));
}

void SharedND::_resize(const std::vector<size_t> &shape) {
    auto tshp = npy::vec2shape(shape);
    if (m_dev_tensor) {
        m_dev_tensor->resize(tshp);
    } else {
        mgb_assert(m_var);
        m_var->shape_alloc(tshp);
    }
}

void SharedND::_reset_zero() {
    fill_zero_dev_tensor(*m_dev_tensor);
}

void SharedND::_copy_from_npyarr(PyObject *npyarr) {
    auto do_copy = [&](DeviceTensorND *dest, VarNode *var) {
        DType dtype = dest ? dest->dtype() : var->dtype();
        mgb_assert(dtype.valid());
        auto hv = npy::np2tensor(npyarr, npy::Meth::borrow(), dtype);
        if (var) {
            // only setup by assign(), by craniotome
            var->shape_alloc(hv.shape());
            dest = &var->mutable_dev_tensor();
        }
        if (!sync(dest->copy_from(hv))) {
            m_async_copy_refkeeper = hv;
        } else {
            m_async_copy_refkeeper = {};
        }
    };

    if (m_var) {
        mgb_assert(!m_dev_tensor);
        do_copy(nullptr, m_var);
    } else {
        mgb_assert(m_dev_tensor);
        do_copy(m_dev_tensor.get(), nullptr);
    }
}

PyObject* SharedND::_get_npyarr() {
    mgb_assert(m_dev_tensor);
    if (m_dev_tensor->empty())
        Py_RETURN_NONE;

    HostTensorND hv;
    hv.comp_node(CompNode::default_cpu())
            .copy_from(*m_dev_tensor)
            .sync();
    return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE);
}

PyObject* SharedND::_get_dtype() {
    mgb_assert(m_dev_tensor);
    return npy::dtype_mgb2np(m_dev_tensor->dtype());
}

void SharedND::_copy_from_value_proxy(CompGraphCallbackValueProxy &value) {
    if (value.eager_copy()) {
        mgb_log_warn("copy from eager-copied CompGraphCallbackValueProxy into"
                     " SharedND; consider using callback_lazycopy; traceback:\n%s",
                     PyStackExtracter::run().c_str());
    }
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        auto &&src = value.dev_tensor();
        m_var->shape_alloc(src.shape()).
                mutable_dev_tensor().copy_from(src);
    } else {
        mgb_assert(m_dev_tensor);
        sync(m_dev_tensor->copy_from(value.dev_tensor()));
    }
}

void SharedND::_share_from_value_proxy(CompGraphCallbackValueProxy& value) {
    if (value.eager_copy()) {
        mgb_log_warn(
                "share value from eager-copied CompGraphCallbackValueProxy into"
                " SharedND; consider using callback_lazycopy; traceback:\n%s",
                PyStackExtracter::run().c_str());
    }
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        m_var->reset_dev_tensor_from_tensor(value.dev_tensor());
    } else {
        mgb_assert(m_dev_tensor);
        *m_dev_tensor = value.dev_tensor();
    }
}

SharedND SharedND::_from_symvar(SymbolVar symvar) {
    auto opr = symvar.node()->owner_opr();
    if (auto vsnd = opr->try_cast_final<opr::VolatileSharedDeviceTensor>()) {
        return SharedND(vsnd->dev_data());
    }
    if (auto snd = opr->try_cast_final<opr::SharedDeviceTensor>()) {
        return SharedND(snd->dev_data());
    }
    mgb_throw(MegBrainError, "cannot convert from %s", opr->dyn_typeinfo()->name);
}

uintptr_t SharedND::_pubapi_dev_tensor_ptr(int version) {
    DeviceTensorND *dv;
    if (m_dev_tensor) {
        mgb_assert(!m_var);
        dv = m_dev_tensor.get();
    } else {
        mgb_assert(m_var);
        dv = nullptr;
    }
    void *ret;
    if (version == 0) {
        if (dv) {
            ret = dv->raw_ptr();
        } else {
            ret = m_var->dev_tensor().raw_ptr();
        }
    } else {
        init_pubapi_dev_tensor(m_pubapi_dev_tensor, dv, m_var, false);
        ret = &m_pubapi_dev_tensor;
    }
    return reinterpret_cast<uintptr_t>(ret);
}

SymbolVar SharedND::_as_sym_var(CompGraph &cg, const std::string &name,
                                bool volatile_) {
    mgb_assert(m_dev_tensor);
    OperatorNodeConfig config;
    if (!name.empty())
        config.name(name);
    if (volatile_) {
        return opr::VolatileSharedDeviceTensor::make(cg.get(), m_dev_tensor,
                                                     config);
    } else {
        return opr::SharedDeviceTensor::make(cg.get(), m_dev_tensor, config);
    }
}

std::vector<size_t> SharedND::_get_shape() {
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        return npy::shape2vec(m_var->shape());
    }
    mgb_assert(m_dev_tensor);
    return npy::shape2vec(m_dev_tensor->shape());
}

void SharedND::copy_to_sub_from_shared(
        int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step,
        const SharedND &rhs) {
    mgb_assert(m_dev_tensor && rhs.m_dev_tensor);
    auto sub = m_dev_tensor->sub(
            Slice(begin, end, step).apply(m_dev_tensor->layout(), axis));
    sub.copy_from_fixlayout(*rhs.m_dev_tensor).sync();
}

void SharedND::copy_from_shared_sub(const SharedND &rhs,
        int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step) {
    mgb_assert(m_dev_tensor && rhs.m_dev_tensor);
    if (axis == -3) {
        sync(m_dev_tensor->copy_from_fixlayout(*rhs.m_dev_tensor));
    } else if (axis == -2) {
        sync(m_dev_tensor->copy_from(*rhs.m_dev_tensor));
    } else {
        auto sub = rhs.m_dev_tensor->sub(
                Slice(begin, end, step).apply(
                        rhs.m_dev_tensor->layout(), axis));
        sync(m_dev_tensor->copy_from(sub));
    }
}

void SharedND::_check_before_share_memory(const SharedND& rhs) {
    mgb_assert(rhs.m_dev_tensor);
    mgb_assert(m_dev_tensor);
    mgb_assert(rhs.m_dev_tensor->dtype() == m_dev_tensor->dtype());
    mgb_assert(rhs.m_dev_tensor->comp_node() == m_dev_tensor->comp_node());
}

void SharedND::_share_memory_from(const SharedND& rhs, size_t begin) {
    _check_before_share_memory(rhs);
    m_dev_tensor->reset(
            rhs.m_dev_tensor->storage().sub(m_dev_tensor->dtype().size() * begin),
            m_dev_tensor->layout());
}

void SharedND::_reset_dev_tensor(const SharedND &rhs) {
    _check_before_share_memory(rhs);
    *m_dev_tensor = *(rhs.m_dev_tensor);
}

/* ================= _HostSharedND ================= */

void _HostSharedND::ensure_own_storage() {
    if (!m_own_storage) {
        mgb_assert(m_tensor);
        HostTensorND val{m_tensor->comp_node(), m_tensor->dtype()};
        if (!m_tensor->empty()) {
            val.resize(m_tensor->shape());
        }
        *m_tensor = std::move(val);
        m_own_storage = true;
    }
}

void _HostSharedND::_resize(const std::vector<size_t> &shape) {
    ensure_own_storage();
    m_tensor->resize(npy::vec2shape(shape));
}

void _HostSharedND::_copy_from_npyarr(PyObject *npyarr, bool borrow) {
    mgb_assert(m_tensor);
    mgb_assert(m_tensor->dtype().valid());
    if (!m_borrow_on_cpu &&
            m_tensor->comp_node().device_type() == CompNode::DeviceType::CPU) {
        borrow = false;
    }
    if (borrow) {
        auto val = npy::np2tensor(
                npyarr, npy::Meth::borrow(m_tensor->comp_node()),
                m_tensor->dtype());
        m_own_storage = false;
        *m_tensor = std::move(val);
    } else {
        ensure_own_storage();
        npy::np2tensor(npyarr,
                npy::Meth::copy_into(m_tensor.get()), m_tensor->dtype());
    }
}

SymbolVar _HostSharedND::_as_sym_var(CompGraph &cg, bool enable_static_infer,
        const std::string &name) {
    if (m_tensor->empty())
        cg.get().options().allocate_static_mem_after_graph_compile = false;

    OperatorNodeConfig config;
    if (!name.empty())
        config.name(name);

    SymbolVar ret;
    if (enable_static_infer) {
        ret = opr::Host2DeviceCopy::make(cg.get(), m_tensor, config);
    } else {
        ret = opr::Host2DeviceCopy::make_no_value_infer(cg.get(), m_tensor,
                                                        config);
    }
    UserInputVars::get(&cg.get()).register_var(ret);
    return ret;
}

_HostSharedND _HostSharedND::make_proxy(SymbolVar var) {
    auto &&opr = var.node()->owner_opr()->
            cast_final_safe<opr::Host2DeviceCopy>();
    _HostSharedND rst{var.node()->comp_node(), var.dtype()};
    rst.m_tensor = opr.host_data();
    rst.m_proxied_opr = &opr;
    return rst;
}

std::string _HostSharedND::__repr__() const {
    if (m_proxied_opr) {
        return ssprintf("<HostSharedND proxy at %p for %s>",
                this, m_proxied_opr->cname());
    }
    return ssprintf("<HostSharedND at %p>", this);
}

PyObject* _HostSharedND::_get_dtype() {
    mgb_assert(m_tensor);
    return npy::dtype_mgb2np(m_tensor->dtype());
}

/* ================= CompGraphCallbackValueProxy ================= */

CompGraphCallbackValueProxy
CompGraphCallbackValueProxy::make_raw_host_value_proxy(
        const mgb::HostTensorND &hv) {
    CompGraphCallbackValueProxy ret;
    ret.m_use_raw_hv = true;
    ret.m_hv = hv;
    ret.m_is_active = true;
    return ret;
}

void CompGraphCallbackValueProxy::setup(
        const mgb::DeviceTensorND &val, bool eager_copy) {
    while (__atomic_load_n(&m_is_active, __ATOMIC_SEQ_CST)) {
        // wait for previous callback to finish
        std::this_thread::yield();
    }
    mgb_assert(!m_use_raw_hv && val.shape_valid());
    m_eager_copy = eager_copy;
    m_dev_value = val;
    if (eager_copy) {
        m_value_used = false;
        do_copy();
    } else {
        m_value_used = true;
    }
    __atomic_store_n(&m_is_active, true, __ATOMIC_SEQ_CST);
}

void CompGraphCallbackValueProxy::do_copy() {
    mgb_assert(!m_use_raw_hv && m_dev_value.shape_valid());
    m_hv.copy_from(m_dev_value);
    auto cn = m_hv.comp_node();
    if (!m_copy_event)
        m_copy_event = cn.create_event();
    m_copy_event->record();
}

void CompGraphCallbackValueProxy::sync() {
    mgb_assert(!m_use_raw_hv);
    RealTimer t0;
    double next_warn_time = 2, warn_time_delta = 1;
    while (!m_copy_event->finished()) {
        usleep(1);
        if (t0.get_secs() >= next_warn_time) {
            mgb_log_warn("wait d2h copy for more than %.3f secs",
                    t0.get_secs());
            next_warn_time += warn_time_delta;
            warn_time_delta += 1;
        }
    }
}

void CompGraphCallbackValueProxy::on_finished() {
    mgb_assert(m_is_active && !m_use_raw_hv);
    m_dev_value = {};
    if (m_hv.shape_valid()) {
        m_hv.resize({}); // resize to reuse buffer
    }
    __atomic_store_n(&m_is_active, false, __ATOMIC_SEQ_CST);
    if (!m_value_used) {
        mgb_log_warn("computing graph callback did not read the value");
    }
}

PyObject* CompGraphCallbackValueProxy::_get_npyarr() {
    mgb_assert(m_is_active);
    if (!m_use_raw_hv) {
        mgb_assert(m_dev_value.shape_valid());
        if (!m_hv.shape_valid()) {
            do_copy();
            sync();
        }
    }
    m_value_used = true;
    return npy::ndarray_from_tensor(m_hv, npy::ShareType::TRY_SHARE);
}

PyObject* CompGraphCallbackValueProxy::_get_dtype() {
    mgb_assert(m_is_active);
    if (m_use_raw_hv)
        return npy::dtype_mgb2np(m_hv.dtype());
    mgb_assert(m_dev_value.shape_valid());
    return npy::dtype_mgb2np(m_dev_value.dtype());
}

std::vector<size_t> CompGraphCallbackValueProxy::_get_shape() {
    mgb_assert(m_is_active);
    if (m_use_raw_hv)
        return npy::shape2vec(m_hv.shape());
    mgb_assert(m_dev_value.shape_valid());
    return npy::shape2vec(m_dev_value.shape());
}

uintptr_t CompGraphCallbackValueProxy::_pubapi_dev_tensor_ptr(int version) {
    mgb_assert(m_is_active && !m_use_raw_hv);
    mgb_assert(m_dev_value.shape_valid());
    void *ret;
    if (version == 0) {
        ret = m_dev_value.raw_ptr();
    } else {
        init_pubapi_dev_tensor(
                m_pubapi_dev_tensor, &m_dev_value, nullptr, true);
        ret = &m_pubapi_dev_tensor;
    }
    return reinterpret_cast<uintptr_t>(ret);
}

mgb::CompNode CompGraphCallbackValueProxy::_get_comp_node() {
    mgb_assert(m_is_active && !m_use_raw_hv);
    mgb_assert(m_dev_value.shape_valid());
    return m_dev_value.comp_node();
}

/* ================= AsyncExec ================= */

class AsyncExec::Core {
public:
    Core(std::unique_ptr<mgb::cg::AsyncExecutable> f):
        m_func(std::move(f))
    {
    }

    mgb::cg::AsyncExecutable* func() const {
        return m_func.get();
    }

    struct CallbackParam {
        std::vector<CompGraphCallbackValueProxy> value;
        _CompGraphCallback *cb;
    };

    void dispatch_callback(const CallbackParam &param) {
        m_worker.add_task(param);
    }

    void wait_callback_finish() {
        m_worker.wait_all_task_finish();
    }

private:
    std::unique_ptr<mgb::cg::AsyncExecutable> m_func;

    class Worker final: public AsyncQueueSC<CallbackParam, Worker> {
    public:
        void process_one_task(CallbackParam &task) {
            for (auto &tmp_value: task.value) {
                tmp_value.sync();
            }
            task.cb->call_pycb();
        }
    };

    Worker m_worker;
};

AsyncExec::AsyncExec(std::unique_ptr<mgb::cg::AsyncExecutable> f):
    m_core(std::make_shared<Core>(std::move(f)))
{
}

AsyncExec::~AsyncExec() {
    if (m_core)
        _wait();
}

AsyncExec::Core* AsyncExec::core() const {
    return m_core.get();
}

void AsyncExec::_execute() {
    m_core->func()->execute();
}

std::string AsyncExec::_to_json_str() {
    auto jv = m_core->func()->to_json();
    return jv->to_string();
}

void AsyncExec::_wait() {
    m_core->wait_callback_finish();
    m_core->func()->wait();
}

double AsyncExec::_get_prev_exec_time() {
    return m_core->func()->get_prev_exec_time();
}

SymbolVarArray AsyncExec::_find_mutable_input() {
    ThinHashSet<VarNode*> used_set;
    UserInputVars* user_vars = nullptr;
    auto cb = [&](cg::OperatorNodeBase* opr) {
        if (!user_vars) {
            ComputingGraph* g;
            if (m_multi_part_par_graph)
                g = m_multi_part_par_graph.get();
            else
                g = opr->owner_graph();
            user_vars = &UserInputVars::get(g);
        }
        if (auto var = user_vars->check(opr)) {
            used_set.insert(var);
        }
        return true;
    };
    m_core->func()->iter_opr_seq(cb);
    for (auto i : m_core->func()->get_rt_static_source_deps()) {
        cb(i.dest->owner_opr());
    }
    SymbolVarArray ret;
    ret.reserve(used_set.size());
    ret.insert(ret.begin(), used_set.begin(), used_set.end());
    return ret;
}

void AsyncExec::clear_device_memory() {
    _wait();
    m_core->func()->clear_device_memory();
}

std::vector<std::pair<CompNode, size_t>>
AsyncExec::_update_static_alloc_plan_and_get_size() {
    std::vector<std::pair<CompNode, size_t>> ret;
    for (auto&& i : m_core->func()->update_static_alloc_plan_and_get_size()) {
        ret.emplace_back(i.first, i.second);
    }
    return ret;
}

/* ================= _CompGraphCallback ================= */

void _CompGraphCallback::set_async_exec(const AsyncExec &ae) {
    mgb_assert(!m_ae_core);
    m_ae_core = ae.core();
}

void _CompGraphCallback::set_eager_copy(bool flag) {
    mgb_assert(!m_cb_created);
    m_eager_copy = flag;
}

std::function<void(mgb::SmallVector<mgb::DeviceTensorND> &)>
_CompGraphCallback::make_multi_input_callback() {
    mgb_assert(!m_cb_created);
    m_cb_created = true;
    // shared_ptr would delete this afterwards
    std::shared_ptr<_CompGraphCallback> self(this);
    auto cb = [self](SmallVector<mgb::DeviceTensorND> &data) {
        for (size_t i = self->m_value_proxies.size(); i < data.size(); ++i) {
            self->m_value_proxies.emplace_back();
        }
        if (self->m_eager_copy) {
            mgb_assert(self->m_ae_core);
            for (size_t i = 0; i < self->m_value_proxies.size(); ++i) {
                self->m_value_proxies[i].setup(data[i], true);
            }
            self->m_ae_core->dispatch_callback(
                    AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
            );
        } else {
            for (size_t i = 0; i < self->m_value_proxies.size(); ++i)
                self->m_value_proxies[i].setup(data[i], false);
            self->call_pycb();
        }
    };
    return cb;
}

std::function<void(mgb::DeviceTensorND &)> _CompGraphCallback::make_callback() {
    this->m_value_proxies.emplace_back();
    mgb_assert(!m_cb_created);
    m_cb_created = true;
    // shared_ptr would delete this afterwards
    std::shared_ptr<_CompGraphCallback> self(this);
    auto cb = [self](mgb::DeviceTensorND &data) {
        if (self->m_eager_copy) {
            mgb_assert(self->m_ae_core);
            self->m_value_proxies[0].setup(data, true);
            self->m_ae_core->dispatch_callback(
                    AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
            );
        } else {
            self->m_value_proxies[0].setup(data, false);
            self->call_pycb();
        }
    };
    return cb;
}

void _CompGraphCallback::call_pycb() {
    try {
        call(m_value_proxies);
    } catch (...) {
        for (auto &m_value_proxy: m_value_proxies) {
            m_value_proxy.on_finished();
        }
        throw;
    }
    for (auto &m_value_proxy: m_value_proxies) {
        m_value_proxy.on_finished();
    }
}

/* ================= CompGraph ================= */

class CompGraph::PyUserData final: public UserDataContainer::UserData,
                                   public NonCopyableObj {
    MGB_TYPEINFO_OBJ_DECL;

    PyObject *m_obj;

public:
    PyUserData() {
        PYTHON_GIL;
        m_obj = PyDict_New();
        mgb_assert(m_obj, "failed to create python object");
    }

    ~PyUserData() {
        PYTHON_GIL;
        Py_DECREF(m_obj);
    }

    PyObject* get() const {
        return m_obj;
    }
};
MGB_TYPEINFO_OBJ_IMPL(CompGraph::PyUserData);

mgb::ComputingGraph& CompGraph::get() const {
    if (m_comp_graph_own)
        return *m_comp_graph_own;
    auto &&val = m_comp_graph_borrow.lock();
    mgb_assert(val, "CompGraph has been destructed");
    return *val;
}

void CompGraph::clear_device_memory() {
    if (!m_comp_graph_own)
        return;
    m_comp_graph_own->clear_device_memory();
}

PyObject* CompGraph::_user_data() {
    auto ct = get().options().user_data.get_user_data_or_create<PyUserData>();
    auto ret = ct->get();
    PYTHON_GIL;
    Py_INCREF(ret);
    return ret;
}

void CompGraph::_add_output_spec(
        mgb::cg::SymbolVar &var, _CompGraphCallback *callback) {
    cg::ComputingGraph::Callback cb;
    if (callback) {
        cb = callback->make_callback();
        m_raw_callbacks.push_back({callback, m_out_specs.size() - 1});
    }
    if (m_out_specs.empty()) {
        m_out_specs.emplace_back();
    }
    m_out_specs.back().push_back({var, cb});
}

AsyncExec CompGraph::_do_compile(bool copy, bool optimize_for_inference) {
    mgb_assert(m_out_specs.size() == 1, "got %zu output specs for compile",
               m_out_specs.size());
    auto&& spec = m_out_specs[0];
    if (optimize_for_inference) {
        SymbolVarArray vars;
        vars.reserve(spec.size());
        for (auto&& i : spec) {
            vars.push_back(i.first);
        }
        vars = gopt::optimize_for_inference(vars, {});
        mgb_assert(vars.size() == spec.size());
        for (size_t i = 0; i < vars.size(); ++i) {
            spec[i].first = vars[i];
        }
    }
    std::unique_ptr<mgb::cg::AsyncExecutable> async_executable;
    if (get().options().eager_evaluation ||
            (copy && get().current_comp_seq())) {
        // need to copy a new comp graph
        SymbolVarArray vars;
        vars.reserve(spec.size());
        for (auto&& i : spec) {
            vars.emplace_back(i.first);
        }
        // copy graph
        auto new_graph = mgb::ComputingGraph::make();
        SymbolVarArray new_vars =
                replace_vars_comp_graph(std::move(vars), new_graph.get());
        mgb_assert(new_vars.size() == spec.size());
        // register input
        auto h2d = find_h2d(new_vars);
        for (auto&& i : h2d) {
            UserInputVars::get(new_graph.get()).register_var(i);
        }
        mgb::ComputingGraph::OutputSpec new_spec;
        new_spec.reserve(spec.size());
        for (size_t i = 0; i < spec.size(); ++i) {
            new_spec.emplace_back(mgb::ComputingGraph::OutputSpecItem{
                    new_vars[i], spec[i].second});
        }
        async_executable = new_graph->compile(new_spec);
    } else {
        async_executable = get().compile(spec);
    }
    AsyncExec ret{std::move(async_executable)};
    for (auto&& i : m_raw_callbacks) {
        mgb_assert(!i.second);
        i.first->set_async_exec(ret);
    }
    _clear_output_spec();
    return ret;
}

std::vector<AsyncExec> CompGraph::_do_compile_multi_part() {
    // last spec is empty due to an extra call to _add_multi_part_endpoint()
    mgb_assert(m_out_specs.size() > 1 && m_out_specs.back().empty(),
               "got %zu output specs for multi-part compile",
               m_out_specs.size());
    m_out_specs.pop_back();
    std::vector<AsyncExec> ret;
    ret.reserve(m_out_specs.size());
    auto graph = get().shared_from_this();
    for (auto&& i : graph->compile_multi_part(m_out_specs)) {
        ret.emplace_back(std::move(i));
    }
    for (auto&& i : ret) {
        i.set_multi_part_par_graph(graph);
    }
    for (auto&& i : m_raw_callbacks) {
        i.first->set_async_exec(ret.at(i.second));
    }
    _clear_output_spec();
    return ret;
}

/* ================= SharedScalar ================= */

SharedScalar::SharedScalar(PyObject *val):
    m_val{std::make_shared<DTypeScalar>()}
{
    _set(val);
}

HostTensorND& SharedScalar::val_as_host_nd() {
    if (m_val_as_host_nd.empty()) {
        HostTensorStorage storage;
        storage.reset(CompNode::default_cpu(), m_val->dtype().size(),
                {m_val, static_cast<dt_byte*>(
                        const_cast<void*>(m_val->storage()))});
        m_val_as_host_nd.reset(storage, {TensorShape{1}, m_val->dtype()});
    }
    return m_val_as_host_nd;
}

void SharedScalar::_set(PyObject *val) {
    auto tensor = npy::np2tensor(val, npy::Meth::borrow(), {});
    mgb_assert(tensor.layout().is_scalar(),
               "value given to SharedScalar must be scalar; got shape %s",
               tensor.shape().to_string().c_str());
    if (m_dtype_locked) {
        mgb_assert(tensor.dtype() == m_val->dtype(),
                   "dtype for SharedScalar has been locked as %s, "
                   "but attempt to set it to %s", m_val->dtype().name(),
                   tensor.dtype().name());
    }
    m_val->set_raw(tensor.dtype(), tensor.raw_ptr());
    if (!m_dev_val.empty()) {
        auto &&hv = val_as_host_nd();
        for (auto &&i: m_dev_val)
            i.second->copy_from_fixlayout(hv);
    }
}

PyObject* SharedScalar::_get() {
    HostTensorND hv{CompNode::default_cpu(), TensorShape{1}, m_val->dtype()};
    memcpy(hv.raw_ptr(), m_val->storage(), m_val->dtype().size(1));
    return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE);
}

SymbolVar SharedScalar::_as_sym_var(CompGraph &cg, mgb::CompNode &cn) {
    m_dtype_locked = true;
    auto &&dv = m_dev_val[cn];
    auto &&hv = val_as_host_nd();
    if (!dv) {
        dv = std::make_shared<DeviceTensorND>(cn);
        dv->copy_from(hv);
    }
    return opr::SharedDeviceTensor::make(cg.get(), dv,
            ssprintf("SharedScalar@%p", m_val.get()));
}

/* =============== Operator =============== */

const std::unique_ptr<mgb::OprFootprint> Operator::sm_opr_footprint_ptr{
        std::make_unique<mgb::OprFootprint>()};

/* ================= misc ================= */

SymbolVar fill_retain_dtype(SymbolVar var, PyObject *value) {
    auto tensor = npy::np2tensor(value, npy::Meth::borrow(), {});
    mgb_assert(tensor.shape().is_scalar(),
               "value for fill_retain_dtype must be scalar; got shape %s",
               tensor.shape().to_string().c_str());
    switch (tensor.dtype().enumv()) {
#define cb(_dt) case DTypeTrait<_dt>::enumv: \
        static_assert(sizeof(DTypeTrait<_dt>::ctype) <= sizeof(int), \
                "bad dtype size"); \
        return var.fill_retain_dtype(static_cast<int>( \
                *tensor.ptr<DTypeTrait<_dt>::ctype>()));
        MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb)
#undef cb
        case DTypeEnum::Float32:
            return var.fill_retain_dtype(*tensor.ptr<dt_float32>());
        case DTypeEnum::Float16:
            return var.fill_retain_dtype(
                    static_cast<float>(*tensor.ptr<dt_float16>()));
        case DTypeEnum::BFloat16:
            return var.fill_retain_dtype(
                    static_cast<float>(*tensor.ptr<dt_bfloat16>()));
        // TODO: What does this mean?
        case DTypeEnum::Quantized8Asymm:
        case DTypeEnum::QuantizedS32:
        case DTypeEnum::QuantizedS8:
        case DTypeEnum::Quantized4Asymm:
        case DTypeEnum::QuantizedS4:
        case DTypeEnum::Byte:
        case DTypeEnum::QuantizedS16:
            break;
#define cb(low_bit, size) \
        case DTypeEnum::low_bit##size: \
            break;
        MEGDNN_FOREACH_LOWBIT_DTYPE(cb)
#undef cb
    }
    throw ConversionError(ssprintf(
            "unsupported value dtype: %s", tensor.dtype().name()));
}

PyObject* get_symvar_inferred_value(mgb::SymbolVar symvar) {
    auto var = symvar.node();
    auto&& mgr = var->owner_graph()->static_infer_manager();
    using IT = cg::static_infer::InferType;
    auto it = mgr.get_infer_type(var);
    if (!(it.value & (IT::CONST | IT::RT_STATIC)))
        Py_RETURN_NONE;

    auto val = mgr.infer_value_fallible(var);
    if (!val)
        Py_RETURN_NONE;

    auto hv = HostTensorND::make_proxy(*val);
    return npy::ndarray_from_tensor(hv, npy::ShareType::MUST_UNSHARE);
}

void _mgb_global_finalize() {
    CompNode::finalize();
    g_global_finalize_called = true;
}

bool global_finalized() {
    return g_global_finalize_called;
}

std::vector<size_t> _get_mgb_version() {
    return {MGB_MAJOR, MGB_MINOR, MGB_PATCH, MGB_IS_DEV};
}

SymbolVarArray _grad(SymbolVar target, SymbolVarArray wrts,
                     bool warn_mid_wrt, int use_virtual_grad,
                     bool return_zero_for_nodep) {
    if (use_virtual_grad == -1) {
        use_virtual_grad = std::abs(
                target.node()->owner_graph()->options().graph_opt_level) >= 2;
    }

    if (use_virtual_grad) {
        mgb_assert(return_zero_for_nodep,
                   "can't return a null var when using virtual grad opr");
        SymbolVarArray ret;
        ret.reserve(wrts.size());
        for (auto&& wrt : wrts) {
            ret.push_back(opr::VirtualGrad::make(target, wrt));
        }
        return ret;
    }

    return cg::grad(target, wrts, warn_mid_wrt, return_zero_for_nodep);
}

SymbolVar _inter_graph_trans_var(
        CompGraph &dest_graph, SymbolVar src) {
    auto &&graph = dest_graph.get();
    auto trans = mgb::cg::InterGraphVarTransformer::get(graph);
    mgb_assert(trans, "trans func on graph %p has not been setup", &graph);
    return trans->trans(src.node());
}

SymbolVar _get_graph_optimizer_replaced_var(SymbolVar src) {
    return gopt::GraphOptimizer::var_replace_lookup(src.node());
}

void mark_as_input(ComputingGraph* cg, SymbolVar var) {
    VarNode* node = var.node();
    mgb_assert(node->owner_graph() == cg);
    mgb_assert(node->owner_opr()->same_type<opr::Host2DeviceCopy>());
    UserInputVars::get(cg).register_var(var);
}

namespace {

void add_update_impl(const DeviceTensorND& dest,
        const DeviceTensorND& delta_nobrd,
        float alpha, float beta, float bias) {
    auto&& cn = dest.comp_node();
    using DT = CompNode::DeviceType;
    mgb_assert(cn == delta_nobrd.comp_node() &&
               (cn.device_type() == DT::CUDA || cn.device_type() == DT::CPU));
    mgb_assert(dest.dtype() == delta_nobrd.dtype());
    auto&& delta = delta_nobrd.sub(SubTensorSpec::make_from_offset_elem(
            delta_nobrd.layout().broadcast(dest.shape()), 0));
    cn.activate();
    if (!static_cast<bool>(alpha) && beta == 1 &&
            !static_cast<bool>(bias)) {
        dest.copy_from_fixlayout(delta);
    } else {
        auto&& handle = MegDNNHandle::get(
                CompNodeEnv::from_comp_node(cn)).handle();
        auto&& op = handle->create_operator<megdnn::AddUpdate>();
        op->param() = {alpha, beta, bias};
        op->exec(dest.as_megdnn(), delta.as_megdnn());
        if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
            CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
                    [p = op.release()] { delete p; }
            );
        }
    }
}

} // anonymous namespace

void _add_update_fastpath(SharedND& dest_, SharedND& delta_,
        float alpha, float beta, float bias) {
    auto&& dest = dest_.dev_tensor();
    auto&& delta = delta_.dev_tensor();
    add_update_impl(*dest, *delta, alpha, beta, bias);
}

void _add_update_fastpath(SharedND& dest_, CompGraphCallbackValueProxy& delta_,
        float alpha, float beta, float bias) {
    auto&& dest = dest_.dev_tensor();
    auto&& delta = delta_.dev_tensor();
    add_update_impl(*dest, delta, alpha, beta, bias);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package bundles the CUDA environment needed to run code on GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
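A quick way to verify the setup is sketched below; this is a minimal example assuming the MegEngine 1.x Python API, where is_cuda_available() and get_device_count() are the assumed entry points and may differ between versions.

import megengine as mge

print("MegEngine version:", mge.__version__)
# is_cuda_available() is expected to return True only when both GPU hardware
# and a working driver are present; the same package otherwise runs on CPU.
if mge.is_cuda_available():
    print("GPU devices visible:", mge.get_device_count("gpu"))
else:
    print("no usable GPU found; running on CPU")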