
profiler_impl.cpp 24 kB

/**
 * \file src/gopt/impl/profiler_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "./opr_format_modifier.h"
#include "./utils.h"
#include "megbrain/gopt/framework.h"
#include "megbrain/gopt/profiler.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/plugin/base.h"
#include "megbrain/serialization/sereg.h"

using namespace mgb;
using namespace cg;
using namespace opr;
using namespace gopt;
using ReformatKey = ReformatManager::ReformatKey;
namespace {
using OprFormat = Problem::OprFormat;

OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
    switch (tensor_format) {
        case TensorFormats::NCHW:
            return OprFormat::NCHW;
        case TensorFormats::NCHWc4:
            return OprFormat::NCHW4;
        case TensorFormats::NCHWc8:
            return OprFormat::NCHW8;
        case TensorFormats::NCHWc32:
            return OprFormat::NCHW32;
        case TensorFormats::NCHWc64:
            return OprFormat::NCHW64;
        case TensorFormats::NHWC:
            return OprFormat::NHWC;
        case TensorFormats::CHWNc4:
            return OprFormat::CHWN4;
        default:
            mgb_throw(
                    MegBrainError, "tensor format(%u) is not supported",
                    static_cast<uint32_t>(tensor_format));
    }
}
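/*!
 * \brief A plugin that records BeforeKernel/AfterKernel events for the
 * operators accepted by the filter, so that the accumulated device time of a
 * graph partition can be queried via duration_in_usec().
 */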
class GraphPartitionProfiler final : public PluginBase {
    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;

public:
    using OprFilter = thin_function<bool(OperatorNodeBase*)>;
    struct OprKernEvent {
        CompNodeEventPtr start, end;
    };
    GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter);
    ~GraphPartitionProfiler() noexcept;
    float duration_in_usec() const;

private:
    void record_event(CompNodeEventPtr& dest, CompNode cn) {
        if (dest == nullptr)
            dest = cn.create_event(CompNode::Event::NEED_TIMER);
        dest->record();
    }
    ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event;
    OprFilter m_opr_filter;
};
GraphPartitionProfiler::GraphPartitionProfiler(
        ComputingGraph* graph, OprFilter opr_filter)
        : PluginBase(graph), m_opr_filter(opr_filter) {
    using namespace event;
    auto on_before_kern = [this](BeforeKernel const& event) {
        if (!m_opr_filter(event.opr))
            return;
        auto evptr = &m_kern_event[event.opr].start;
        record_event(*evptr, event.comp_node);
    };
    auto on_after_kern = [this](AfterKernel const& event) {
        if (!m_opr_filter(event.opr))
            return;
        auto evptr = &m_kern_event[event.opr].end;
        record_event(*evptr, event.comp_node);
    };
    auto&& ev = graph->event();
    add_event_handler(ev.register_receiver<BeforeKernel>(on_before_kern));
    add_event_handler(ev.register_receiver<AfterKernel>(on_after_kern));
}

GraphPartitionProfiler::~GraphPartitionProfiler() noexcept {
    auto wait = [](const CompNodeEventPtr& ev) {
        if (ev)
            ev->host_wait();
    };
    for (auto&& i : m_kern_event) {
        wait(i.second.start);
        wait(i.second.end);
    }
}

float GraphPartitionProfiler::duration_in_usec() const {
    float device_duration = 0.f;
    for (auto&& kern_ev : m_kern_event) {
        auto&& event = kern_ev.second;
        event.end->host_wait();
        device_duration += 1e6 * event.start->elapsed_time_until(*event.end);
    }
    return device_duration;
}
/*!
 * \brief An operator that indicates its input var node is contiguous
 */
// clang-format off
MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{
    void scn_do_execute() override {};
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override {
        input(0)->add_layout_constraint_contiguous();
    }
public:
    MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {});
};
// clang-format on

MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous);

MarkInputContiguous::MarkInputContiguous(
        VarNode* input, const OperatorNodeConfig& config)
        : Super(input->owner_graph(), config, "mark_contiguous", {input}) {
    add_input({input});
    add_output(None);
}

SymbolVar MarkInputContiguous::make(SymbolVar input, const OperatorNodeConfig& config) {
    return input.insert_single_output_opr<MarkInputContiguous>(input.node(), config);
}

void MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0), ShapeInferDesc::make_identity(input(0)));
}
}  // namespace

/* ================== ProfilerImpl =================*/
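/*!
 * The constructor installs two filters: m_opr_filter rejects candidate operator
 * configurations that are not worth profiling (e.g. naive NCHW quantized ConvBias
 * on CUDA, or a computation blow-up beyond m_opr_threshold), while
 * m_var_node_filter rejects layout transforms that are unavailable for low-bit
 * tensors or whose memory traffic exceeds m_var_node_threshold times the original
 * tensor size.
 */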
ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, float var_node_threshold)
        : m_opr_threshold{opr_threshold},
          m_var_node_threshold{var_node_threshold},
          m_runs{runs} {
    m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) {
        /// \note: for the considerations of performance, we skip nchw(naive)
        /// kernels for conv bias on CUDA platform. to remove this later
        if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
            if (conv->output(0)->comp_node().device_type() ==
                        CompNode::DeviceType::CUDA &&
                conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED &&
                conv->param().format == OprFormat::NCHW) {
                return false;
            }
        }
        float comp1 = m_opr_footprint.get_computation(
                const_cast<OperatorNodeBase*>(opr));
        float comp2 = m_opr_footprint.get_computation(new_opr);
        if (comp2 > m_opr_threshold * comp1)
            return false;
        return true;
    };
    m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to,
                               ReformatKey key) {
        /// \note: due to the alignment requirement of low-bit tensor, we skip
        /// some layout transform for low-bit tensors. The skipped layout
        /// transforms do not have corresponding dnn kernel and cannot be
        /// implemented by tensor manip operators (like reshape, dimshuffle,
        /// subtensor, etc.).
        if (var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
            var->dtype().enumv() == DTypeEnum::Quantized4Asymm) {
            if (key.input_format == TensorFormats::NCHW &&
                key.output_format != TensorFormats::NHWC &&
                key.output_format != TensorFormats::NCHWc64) {
                return false;
            }
            if (key.output_format == TensorFormats::NCHW &&
                key.input_format != TensorFormats::NHWC &&
                key.input_format != TensorFormats::NCHWc64) {
                return false;
            }
        }
        TensorLayout orig_ly = {var->shape(), var->dtype()},
                     from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
        float orig_memory = orig_ly.span().dist_byte() * 2.f;
        float reformat_memory =
                from_ly.span().dist_byte() + to_ly.span().dist_byte();
        if (reformat_memory > orig_memory * m_var_node_threshold)
            return false;
        return true;
    };
}
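/*!
 * Profile an operator against every available tensor format: the record-returning
 * overload iterates over the formats, and the float overload below rebuilds the
 * operator in a scratch graph with aligned inputs and measures its kernel time.
 */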
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats,
        ReformatAttribute extra_attribute) const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& f : available_tensor_formats) {
        auto opr_format = tensor_formats_to_opr_format(f);
        costs[opr_format] = profile_operator(opr, base_format, f, extra_attribute);
    }
    return record;
}

float ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    for (size_t i = 0; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        auto aligned_tensor_shape = ReformatManager::make_aligned_tensor_shape(
                var, base_format, tensor_format, extra_attribute);
        dval->resize(aligned_tensor_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    auto new_opr = serialization::copy_opr_shallow(
            *opr, new_inps, opr->config(), {graph.get()});
    if (!m_opr_filter(opr, new_opr))
        return PROFILE_TIME_OUT;
    auto y = new_opr->output(0);
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler =
            std::make_unique<GraphPartitionProfiler>(graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
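/*!
 * The OprTensorFormatsConfiguration-based overloads below additionally distinguish
 * weight and feature inputs and rewrite the operator with modify_opr_format, so that
 * format-aware operators are profiled with the exact parameter format being considered.
 */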
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config,
        const SmallVector<OprTensorFormatsConfiguration>& available_configs,
        ReformatAttribute extra_attribute) const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& i : available_configs) {
        costs[i.opr_format] = profile_operator(opr, base_config, i, extra_attribute);
    }
    return record;
}

float ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config,
        ReformatAttribute extra_attribute) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    size_t i = 0;
    size_t nr_input_tensor =
            std::min(config.input_tensor_formats.size(), opr->input().size());
    for (; i < nr_input_tensor; ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        TensorShape aligned_shape;
        if (config.input_tensor_types[i] == TensorType::WEIGHT) {
            mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT);
            aligned_shape = ReformatManager::make_aligned_weight_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i], config.output_tensor_formats[0],
                    extra_attribute);
        } else {
            mgb_assert(
                    base_config.input_tensor_types[i] == config.input_tensor_types[i]);
            mgb_assert(base_config.input_tensor_types[i] == TensorType::FEATURE);
            aligned_shape = ReformatManager::make_aligned_tensor_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i], extra_attribute);
        }
        dval->resize(aligned_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    for (; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto hval = std::make_shared<HostTensorND>(cn, dtype);
        hval->resize(var->shape());
        auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); };
        {
            auto cg = var->owner_graph();
            cg->compile({{var, cb}})->execute();
        }
        auto imm = opr::ImmutableTensor::make(*graph, *hval);
        new_inps[i] = imm.node();
    }
    VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps, opr);
#if 0
    static const ThinHashSet<Typeinfo*> multi_algo_oprs = {
            opr::Convolution::typeinfo(),
            opr::ConvBiasForward::typeinfo(),
            opr::ConvolutionBackwardData::typeinfo(),
            opr::PoolingForward::typeinfo(),
    };
    if (multi_algo_oprs.count(opr->dyn_typeinfo()) &&
        !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()))
        return PROFILE_TIME_OUT;
#endif
    if (!m_opr_filter(opr, y->owner_opr()))
        return PROFILE_TIME_OUT;
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto new_opr = y->owner_opr();
    auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler =
            std::make_unique<GraphPartitionProfiler>(graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
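/*!
 * Profile the cost of converting a var node between each ordered pair of available
 * tensor formats; the float overload below builds the corresponding reformat chain
 * with ReformatManager and measures every operator that chain introduces.
 */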
ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node(
        const VarNode* var, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats,
        ReformatAttribute attribute) const {
    VarNodeRecord record;
    record.var = var;
    auto& costs = record.costs;
    for (auto&& i : available_tensor_formats) {
        for (auto&& o : available_tensor_formats) {
            if (i == o)
                continue;
            ReformatKey key{
                    i, o, attribute, var->dtype().enumv(), var->dtype().enumv()};
            costs[{i, o}] = profile_var_node(var, base_format, key);
        }
    }
    return record;
}

float ProfilerImpl::profile_var_node(
        const VarNode* var, TensorFormats base_format, const ReformatKey& key) const {
    auto&& cn = var->comp_node();
    auto&& dtype = var->dtype();
    auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
    auto aligned_tensor_shape = ReformatManager::make_aligned_tensor_shape(
            var, base_format, key.input_format, key.attribute);
    dval->resize(aligned_tensor_shape);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            var, base_format, key);
    auto y = builder({aligned_var.node()});
    if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), key))
        return PROFILE_TIME_OUT;
    ThinHashSet<OperatorNodeBase*> set;
    DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
    iter.add(y->owner_opr());
    iter.set_visited(aligned_var.node()->owner_opr());
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; };
    auto profiler =
            std::make_unique<GraphPartitionProfiler>(graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
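/*!
 * The main entry point: collect the non-constant operators and var nodes of the
 * graph partition, then profile each operator under its candidate formats or
 * configurations and each var node under every format conversion, returning the
 * measurements as a ProfilingResult.
 */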
ProfilerImpl::ProfilingResult ProfilerImpl::profile(const Problem& problem) const {
    ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
    {
        auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); };
        DepOprIter iter{cb};
        for (auto&& o : problem.graph_partition().output()) {
            iter.add(o->owner_opr());
        }
    }
    static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = {
#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity}
            cb(Convolution, 2),
            cb(ConvBiasForward, 4),
            cb(ConvolutionBackwardData, 2),
            cb(PoolingForward, 1),
            cb(WarpPerspective, 1),
            cb(Resize, 1),
#undef cb
    };
    static const ThinHashSet<Typeinfo*> skip_opr_types = {
            TypeCvt::typeinfo(), Elemwise::typeinfo(), ElemwiseMultiType::typeinfo()};
    ThinHashSet<VarNode*> vars;
    ThinHashSet<OperatorNodeBase*> oprs;
    ThinHashSet<OperatorNodeBase*> skip_oprs;
    for (auto&& opr : problem.graph_partition().all_oprs()) {
        if (cvprop.is_const(opr))
            continue;
        bool skip = true;
        for (auto&& i : opr->input()) {
            skip &= problem.graph_partition().input().count(i) > 0 ||
                    skip_oprs.count(i->owner_opr()) > 0;
        }
        skip &= skip_opr_types.count(opr->dyn_typeinfo());
        if (skip)
            skip_oprs.insert(opr);
        oprs.insert(opr);
        auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
        if (find == format_aware_input_tensors.end()) {
            for (auto&& i : opr->input()) {
                if (!cvprop.is_const(i)) {
                    vars.insert(i);
                }
            }
        } else {
            size_t nr_input_tensor = std::min(find->second, opr->input().size());
            for (size_t i = 0; i < nr_input_tensor; ++i) {
                if (!cvprop.is_const(opr->input(i))) {
                    vars.insert(opr->input(i));
                }
            }
        }
        for (auto&& ov : opr->usable_output()) {
            vars.insert(ov);
        }
    }
    auto base_format = problem.base_format();
    auto&& available_tensor_formats = problem.available_tensor_formats();
    auto&& reformat_attribute = problem.attribute().reformat_attribute;
    ProfilingResult profiling_result;
    auto& opr_record = profiling_result.opr_record;
    auto& var_record = profiling_result.var_record;
    for (auto&& var : vars) {
        var_record[var] = profile_var_node(
                var, base_format, available_tensor_formats, reformat_attribute);
    }
    for (auto&& opr : oprs) {
        auto&& opr_configs = problem.opr_configs();
        auto find = opr_configs.find(opr->dyn_typeinfo());
        if (find == opr_configs.end()) {
            if (skip_oprs.count(opr) > 0) {
                SmallVector<TensorFormats> tensor_formats = {base_format};
                opr_record[opr] = profile_operator(
                        opr, base_format, tensor_formats, reformat_attribute);
            } else {
                opr_record[opr] = profile_operator(
                        opr, base_format, available_tensor_formats,
                        reformat_attribute);
            }
        } else {
            auto&& dispatchers = find->second;
            SmallVector<OprTensorFormatsConfiguration> configs;
            for (const auto& item : dispatchers) {
                auto config = (*item.second)(opr);
                if (config.valid()) {
                    configs.emplace_back(config.val());
                }
            }
            auto base_config = problem.base_config(opr);
            opr_record[opr] =
                    profile_operator(opr, base_config, configs, reformat_attribute);
        }
    }
    for (auto&& rpair : opr_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    for (auto&& rpair : var_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    return profiling_result;
}

/* ================== ProfilerBase =================*/
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
    auto str = ssprintf(
            "\nopr type: %s\nopr name: %s\ninputs:\n", opr->dyn_typeinfo()->name,
            opr->cname());
    for (auto&& i : opr->input()) {
        str += ssprintf(
                "\tvar: %s\n\tshape: %s\n", i->cname(),
                i->shape().to_string().c_str());
    }
    str += ssprintf(
            "outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n", opr->output(0)->cname(),
            opr->output(0)->shape().to_string().c_str());
    for (auto&& cpair : costs) {
        str += ssprintf(
                "\tformat: %s; cost:%f", opr_format_to_string(cpair.first),
                cpair.second);
    }
    return str;
}

std::string ProfilerBase::VarNodeRecord::to_string() const {
    auto str = ssprintf("\nvar: %s\ncosts:", var->cname());
    for (auto&& cpair : costs) {
        auto&& formats = cpair.first;
        str += ssprintf(
                "\n\tformat: (i:%s;o:%s); cost:%f",
                tensor_formats_to_named_tensor_shape(formats.first).to_string().c_str(),
                tensor_formats_to_named_tensor_shape(formats.second)
                        .to_string()
                        .c_str(),
                cpair.second);
    }
    return str;
}

std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
    return std::make_unique<ProfilerImpl>();
}

std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(const char* path) {
    return std::make_unique<CachedProfiler>(path);
}

/* ================== CachedProfiler =================*/
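/*!
 * CachedProfiler reuses measurements stored in ProfilerCache; when a file path is
 * given, the cache is backed by an InFilePersistentCache and dumped back to disk
 * after each profile() call.
 */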
CachedProfiler::CachedProfiler(
        const char* path, int runs, float opr_threshold, float var_node_threshold)
        : ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} {
    if (m_path != nullptr) {  // file cache
        ProfilerCache::inst().set_impl(
                std::make_unique<InFilePersistentCache>(m_path));
    }
}

CachedProfiler::ProfilingResult CachedProfiler::profile(const Problem& problem) const {
    auto ret = ProfilerImpl::profile(problem);
    if (m_path != nullptr)
        ProfilerCache::inst().dump_cache(m_path);
    return ret;
}

float CachedProfiler::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
    ProfilerCache::Key key{
            opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
    auto ret = ProfilerCache::inst().get(key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_operator(
            opr, base_format, tensor_format, extra_attribute);
    ProfilerCache::inst().put(key, rst);
    return rst;
}

float CachedProfiler::profile_operator(
        const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config,
        ReformatAttribute extra_attribute) const {
    ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
    auto ret = ProfilerCache::inst().get(key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_operator(opr, base_config, config, extra_attribute);
    ProfilerCache::inst().put(key, rst);
    return rst;
}

float CachedProfiler::profile_var_node(
        const VarNode* var, TensorFormats base_format, const ReformatKey& key) const {
    ProfilerCache::Key pf_key{var, key};
    auto ret = ProfilerCache::inst().get(pf_key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_var_node(var, base_format, key);
    ProfilerCache::inst().put(pf_key, rst);
    return rst;
}

// vim: syntax=cpp.doxygen
