You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

profiler.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. #include "megbrain/rdnn/profiler.h"
  2. #include "megbrain/utils/invoke.h"
  3. #include "megdnn/handle.h"
  4. #include "megdnn/oprs/base.h"
  5. #if MGB_ROCM
  6. #include "hcc_detail/hcc_defs_prologue.h"
  7. #include "megcore_rocm.h"
  8. #endif
  9. //! TODO: here has to be know some megdnn::opr when there is produced midout.h
  10. //! fix it if there is another graceful way.
  11. #include "megdnn/oprs.h"
  12. #include "midout.h"
  13. MIDOUT_DECL(megbrain_opr_profile)
  14. #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
  15. #define MIDOUT_E \
  16. } \
  17. MIDOUT_END();
  18. namespace {
  19. std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
  20. std::string ret;
  21. //! serialize AlgorithmDesc
  22. megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret);
  23. megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret);
  24. uint32_t param_size = policy.algo.param.size();
  25. uint32_t name_size = policy.algo.name.size();
  26. megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret);
  27. megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret);
  28. ret += policy.algo.param;
  29. ret += policy.algo.name;
  30. //! serialize sub_policy
  31. uint32_t size = policy.sub_policy.size();
  32. megdnn::Algorithm::serialize_write_pod(size, ret);
  33. for (auto&& sub : policy.sub_policy) {
  34. ret += serialize_policy(sub);
  35. }
  36. return ret;
  37. }
  38. megdnn::ExecutionPolicy deserialize_policy(
  39. const char* buf, uint32_t size, uint32_t& offset) {
  40. megdnn::ExecutionPolicy ret;
  41. #define cb(_val, _type) \
  42. _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
  43. offset += sizeof(_val)
  44. cb(ret.algo.handle_type, megdnn::Handle::HandleType);
  45. cb(ret.algo.type, uint32_t);
  46. uint32_t param_size = 0;
  47. uint32_t name_size = 0;
  48. cb(param_size, uint32_t);
  49. cb(name_size, uint32_t);
  50. if (param_size > 0) {
  51. ret.algo.param = std::string(buf + offset, param_size);
  52. offset += param_size;
  53. }
  54. if (name_size > 0) {
  55. ret.algo.name = std::string(buf + offset, name_size);
  56. offset += name_size;
  57. }
  58. uint32_t nr_policy = 0;
  59. cb(nr_policy, uint32_t);
  60. #undef cb
  61. for (uint32_t i = 0; i < nr_policy; i++) {
  62. ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
  63. }
  64. return ret;
  65. }
  66. } // namespace
  67. namespace mgb {
  68. namespace rdnn {
//! Invoke `statement` with the elements of the concatenated tuples bound
//! to the variadic parameter pack `args` (via a generic lambda).
#define APPLY(statement, ...)                                  \
    mgb::apply(                                                \
            [&](const auto&... args) { return statement; },    \
            std::tuple_cat(__VA_ARGS__))
  73. ////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////
  74. template <typename Opr>
  75. typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param::
  76. ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) {
  77. ExecutionPolicyBlob ret;
  78. std::string serialize_bin = serialize_policy(policy);
  79. mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES);
  80. memcpy(ret.data, serialize_bin.data(), serialize_bin.size());
  81. ret.size = serialize_bin.size();
  82. return ret;
  83. }
  84. template <typename Opr>
  85. megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize()
  86. const {
  87. uint32_t offset = 0;
  88. auto&& ret = deserialize_policy(data, size, offset);
  89. mgb_assert(offset == size);
  90. return std::move(ret);
  91. }
//! Explicitly instantiate the blob (de)serializers for every operator
//! that participates in fastrun profiling.
#define INST(Opr)                                                             \
    template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob  \
    TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize(        \
            const megdnn::ExecutionPolicy& policy);                           \
    template megdnn::ExecutionPolicy                                          \
    TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const;
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
  100. ////////////////// TimedProfiler //////////////////////////////
//! Profiling timeout in seconds (0 means "no timeout"). Initialized once
//! per operator type via init_timeout_setting(), which also registers the
//! profiling entry points with TimedFuncInvoker as a side effect.
template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
        TimedProfiler<Opr>::init_timeout_setting();
  104. template <typename Opr>
  105. double TimedProfiler<Opr>::init_timeout_setting() {
  106. #if MGB_ENABLE_FASTRUN
  107. sys::TimedFuncInvoker::ins().register_func(
  108. AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
  109. &TimedProfiler<Opr>::prof_init_device);
  110. auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
  111. if (to_set)
  112. return std::stod(to_set);
  113. #endif
  114. return 0;
  115. }
//! NOTE(review): this re-defines APPLY with exactly the same token
//! sequence as the definition near the top of this file; an identical
//! redefinition is legal but redundant, and one copy should be removed.
#define APPLY(statement, ...)                                  \
    mgb::apply(                                                \
            [&](const auto&... args) { return statement; },    \
            std::tuple_cat(__VA_ARGS__))
//! Generic fallback: Opr is neither ConvBias nor Convolution, so there is
//! no weight-preprocess step; this overload intentionally does nothing.
template <typename Opr>
void TimedProfiler<Opr>::preprocess(
        const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
        UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
        std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) {
}
//! convbias: run weight pre-processing into the buffers of prep_flt so
//! that the timed exec below measures only the main kernel.
template <>
void TimedProfiler<megdnn::ConvBias>::preprocess(
        const TensorLayoutArray& preprocessed_layout,
        const SmallVector<DeviceTensorND>& flt_val,
        UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace,
        std::array<TensorLayout, arity>& layouts,
        std::array<DeviceTensorND, arity_in>& inp_val,
        PreprocessFilter<megdnn::ConvBias>& prep_flt) {
    // A non-empty preprocessed_layout means the operator reported
    // preprocessed filter layouts (deduced in prof_impl).
    if (!preprocessed_layout.empty()) {
        auto&& pf = prep_flt;
        // NOTE(review): presumably exec_preprocess fills algorithm_id in;
        // confirm against the megdnn ConvBias contract.
        pf.algorithm_id = nullptr;
        pf.tensors.resize(flt_val.size());
        for (size_t i = 0; i < flt_val.size(); i++) {
            pf.tensors[i] = flt_val[i].as_megdnn();
        }
        // Argument tuple: src layout, then inp_val[1]/inp_val[2] (presumably
        // filter and bias tensors — TODO confirm input ordering), then the
        // layouts remaining after skipping the first arity_in - 1 entries.
        APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
              std::forward_as_tuple(
                      layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
              array_skip<arity_in - 1>(layouts));
    }
}
//! convolution: run weight pre-processing into the buffers of prep_flt so
//! that the timed exec below measures only the main kernel.
template <>
void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
        const TensorLayoutArray& preprocessed_layout,
        const megdnn::SmallVector<DeviceTensorND>& flt_val,
        UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
        megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
        std::array<DeviceTensorND, arity_in>& inp_val,
        PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
    // A non-empty preprocessed_layout means the operator reported
    // preprocessed filter layouts (deduced in prof_impl).
    if (!preprocessed_layout.empty()) {
        auto&& pf = prep_flt;
        // NOTE(review): presumably exec_preprocess fills algorithm_id in;
        // confirm against the megdnn Convolution contract.
        pf.algorithm_id = nullptr;
        pf.tensors.resize(flt_val.size());
        for (size_t i = 0; i < flt_val.size(); i++) {
            pf.tensors[i] = flt_val[i].as_megdnn();
        }
        // Argument tuple: src layout, then inp_val[1] (presumably the filter
        // tensor — TODO confirm), then the layouts after the first two.
        APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
              std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
              array_skip<2>(layouts));
    }
}
//! Profiling body registered with TimedFuncInvoker (so it may run in a
//! separate worker process): build the operator described by `raw_param`,
//! run it once between two timing events and return the elapsed device
//! time as the result.
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
#if MGB_ROCM
    // prof_init_device() is expected to have enabled MIOpen algo search.
    bool miopen_algo_search_enabled;
    megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled);
    mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled");
#endif
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
    auto megdnn_opr = opr::intl::create_megdnn_opr<Opr>(cn);
    std::array<TensorLayout, arity> layouts;
    // Map a serialized dtype enum back to a concrete DType. Quantized
    // dtypes are constructed with dummy parameters (scale 1.0f, zero point
    // 0); everything else goes through DType::from_enum.
    auto from_enum = [&](DTypeEnum enumv) -> DType {
        switch (enumv) {
            // quantized dtypes that take (scale, zero_point)
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f, static_cast<uint8_t>(0))
            cb(dtype::Quantized8Asymm);
            cb(dtype::Quantized4Asymm);
#undef cb
            // quantized dtypes that take only (scale)
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f)
            cb(dtype::QuantizedS8);
            cb(dtype::QuantizedS16);
            cb(dtype::QuantizedS32);
            cb(dtype::QuantizedS4);
            default:
                return DType::from_enum(enumv);
#undef cb
        }
    };
    for (int i = 0; i < arity; ++i) {
        layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
    }
    megdnn_opr->param() = param.opr_param;
    megdnn_opr->execution_policy() = param.execution_policy.deserialize();

    // Allocate preprocessed weight buffers (only for operators that
    // support weight preprocessing, selected at compile time).
    TensorLayoutArray preprocessed_layout;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (param.allow_weight_preprocess) {
            preprocessed_layout = APPLY(
                    _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts);
        }
    });
    {
        // first allocate a whole chunk to avoid memory fragmentation (here we
        // rely on memory allocator to reuse memory); the chunk is freed at
        // the end of this scope and the individual tensor allocations below
        // are expected to reuse it
        auto align = cn.get_mem_addr_alignment();
        size_t tot_size = align;
        for (int i = 0; i < arity; ++i) {
            tot_size += layouts[i].span().high_byte + align;
        }
        for (const auto& layout : preprocessed_layout) {
            tot_size += layout.span().high_byte + align;
        }
        tot_size += param.workspace;
        DeviceTensorStorage storage{cn};
        storage.ensure_size(tot_size);
    }
    // allocate input and output memory
    std::array<DeviceTensorND, arity_in> inp_val;
    std::array<DeviceTensorND, arity_out> out_val;
    DeviceTensorND workspace;
    for (int i = 0; i < arity_in; ++i) {
        inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
    }
    for (int i = 0; i < arity_out; ++i) {
        out_val[i]
                .comp_node(cn)
                .dtype(layouts[arity_in + i].dtype)
                .resize(layouts[arity_in + i]);
    }
    megdnn::Workspace mdn_workspace;
    // allocate workspace
    if (param.workspace) {
        workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
        mdn_workspace.size = param.workspace;
        mdn_workspace.raw_ptr = workspace.raw_ptr();
    }
    // allocate storage for preprocessed filter
    SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
        flt_val[i] = {
                cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
                preprocessed_layout[i].format};
    }
    // zero-fill the inputs before the timed run
    for (int i = 0; i < arity_in; ++i) {
        fill_zero_dev_tensor(inp_val[i]);
    }
    // run the operator-specific weight-preprocess step (no-op for
    // operators other than ConvBias / ConvolutionForward)
    PreprocessFilter<Opr> prep_flt;
    preprocess(
            preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
            prep_flt);
    RealTimer timer;
    auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
         ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
    ev_start->record();
    // Execute once between the two events; the device time between them is
    // the profiling result. The preprocess-capable branch passes the
    // preprocessed filter (or nullptr when preprocessing was skipped).
    if_constexpr<opr_supports_preprocess<Opr>()>(
            [&](auto _) {
                auto&& opr = _(megdnn_opr);
                PreprocessFilter<Opr>* pf =
                        preprocessed_layout.empty() ? nullptr : &prep_flt;
                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
                      out_val);
            },
            /* else */
            [&](auto _) {
                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
                      out_val);
            });
    ev_end->record();
    megdnn::Algorithm* algo =
            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
    mgb_assert(algo);
    // Poll for completion, logging a warning periodically so a hung or
    // very slow algorithm is visible to the user.
    double next_report_time = 0.5;
    while (!ev_end->finished()) {
        if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs"
                    " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                    algo->name(), timer.get_secs(), param.actual_timeout);
#else
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
                    timer.get_secs(), param.actual_timeout);
#endif
            next_report_time = timer.get_secs() + 1;
        }
        using namespace std::literals;
#if !__DEPLOY_ON_XP_SP2__
        std::this_thread::sleep_for(1000us);
#endif
    }
    // release all free blocks owned by child process,
    // in order to avoid main process running out of memory
    cn.try_coalesce_all_free_memory();
    mgb_assert(ev_start->finished());
    return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
    MIDOUT_E
};  // NOTE(review): trailing ';' after the function body is redundant
//! Run prof_impl for `param` under a time limit via TimedFuncInvoker.
//! `timeout` (seconds; 0 = unlimited) is clamped against the global
//! timeout_setting and written back to the caller. Returns None when the
//! invocation yields no result (res.valid() is false).
template <typename Opr>
Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
        const Param& param, double& timeout) {
    mgb_assert(timeout >= 0);
    if (!timeout) {
        timeout = timeout_setting;
    } else if (timeout_setting) {
        timeout = std::min(timeout, timeout_setting);
    }
    // NOTE(review): this writes through a const reference, so
    // actual_timeout is presumably declared mutable in Param — confirm in
    // the header; the const_cast below serves the same POD-wrapping need.
    param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity();
    auto res = sys::TimedFuncInvoker::ins().invoke(
            AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)),
            timeout);
    if (res.valid())
        return res.val().template as_single_pod<Result>();
    return None;
}
//! Registered with TimedFuncInvoker (see init_timeout_setting) as the init
//! function that runs before prof_impl: warm up the target comp node (and
//! enable MIOpen algo search on ROCm) so device initialization time is not
//! charged against the profiling timeout.
template <typename Opr>
void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
#if MGB_ROCM
    megcore::enableMIOpenAlgoSearch(true);
#endif
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
    // wait for cuda init, so its time does not get accounted in timeout
    cn.sync();
    MIDOUT_E
}
//! Explicitly instantiate the whole TimedProfiler interface for every
//! operator that participates in fastrun profiling.
#define INST(Opr)                                                                     \
    template const double TimedProfiler<megdnn::Opr>::timeout_setting;                \
    template double TimedProfiler<megdnn::Opr>::init_timeout_setting();               \
    template typename TimedProfiler<megdnn::Opr>::TResult                             \
    TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param);                   \
    template Maybe<typename TimedProfiler<megdnn::Opr>::Result>                       \
    TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout);         \
    template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param);
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
  352. } // namespace rdnn
  353. } // namespace mgb
  354. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}