| @@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync | |||
| from .core._imperative_rt.common import ( | |||
| get_supported_sm_versions as _get_supported_sm_versions, | |||
| ) | |||
| from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | |||
| from .config import * | |||
| from .device import * | |||
| from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | |||
| @@ -118,13 +117,6 @@ def _check_sm_version(): | |||
| _check_sm_version() | |||
| _set_fork_exec_path_for_timed_func( | |||
| sys.executable, | |||
| os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), | |||
| ) | |||
| del _set_fork_exec_path_for_timed_func | |||
| _exit_handlers = [] | |||
| @@ -14,9 +14,11 @@ from ._imperative_rt.core2 import ( | |||
| __compute_mode = "default" | |||
| _benchmark_kernel = False | |||
| _deterministic_kernel = False | |||
| _benchmark_with_subprocess = False | |||
| __all__ = [ | |||
| "benchmark_kernel", | |||
| "benchmark_with_subprocess", | |||
| "deterministic_kernel", | |||
| "async_level", | |||
| "disable_memory_forwarding", | |||
| @@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool): | |||
| _deterministic_kernel = option | |||
| @property | |||
| def benchmark_with_subprocess(mod): | |||
| r"""Whether or not run possible algorithms on real device to find the best one. The default option is false, | |||
| which means use heuristic to choose the fastest algorithm. | |||
| Examples: | |||
| .. code-block:: | |||
| import megengine as mge | |||
| mge.config.benchmark_with_subprocess = True | |||
| """ | |||
| return _benchmark_with_subprocess | |||
| @benchmark_with_subprocess.setter | |||
| def benchmark_with_subprocess(mod, option: bool): | |||
| if option: | |||
| import sys | |||
| from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func | |||
| _set_fork_exec_path_for_timed_func( | |||
| sys.executable, | |||
| os.path.join( | |||
| os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py" | |||
| ), | |||
| ) | |||
| @property | |||
| def async_level(mod) -> int: | |||
| r"""Get or set config whether raise error exactly when invoking op. The default level is 2, | |||
| @@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker { | |||
| return iter->second.direct_call(param); | |||
| if (!m_fork_exec_impl) { | |||
| mgb_log_warn( | |||
| mgb_log_debug( | |||
| "timeout is set, but no fork_exec_impl not given; " | |||
| "timeout would be ignored"); | |||
| return iter->second.direct_call(param); | |||
| @@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp | |||
| auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker); | |||
| FOREACH_OPR_TYPE_DISPATCH(search_items, { | |||
| auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); | |||
| // skip different sub opr, for example: | |||
| // skip matmul algo when profiling convolution | |||
| if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type()) | |||
| continue; | |||
| megdnn_opr->param() = | |||
| Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param); | |||
| typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( | |||
| @@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp | |||
| // result, retrive_from_cache = true, allow_log = true | |||
| typename AlgoChooser<Opr>::ImplExecutionPolicy policy; | |||
| construct_execution_policy(selected_strategy, policy); | |||
| return policy; | |||
| if (policy.algo.valid()) | |||
| return policy; | |||
| return choose_by_heuristic(selected_strategy); | |||
| MIDOUT_E | |||
| } | |||
| @@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( | |||
| ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), | |||
| Algorithm::attribute_str(target_attr.first).c_str(), | |||
| Algorithm::attribute_str(target_attr.second).c_str()); | |||
| mgb_log_warn( | |||
| mgb_log_debug( | |||
| "No algo get from cache for %s. This may caused by " | |||
| "mismatch with model and cache file or imcomplete " | |||
| "cache file. ex. profiling with version1, but " | |||
| @@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper: | |||
| if (!rst.valid()) | |||
| return None; | |||
| // subprocess will return dbl_max when memory limit is not satisfied | |||
| if (rst.val().time == std::numeric_limits<double>::max()) | |||
| return None; | |||
| std::string algo_desc; | |||
| serialize_write_pod(policy.algo, algo_desc); | |||
| return AlgoChooserProfileCache::ResultEntry{ | |||
| @@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| auto&& rst = get_profile_result_from_cache(selected_strategy); | |||
| // rst.first.valid means there exists valid algorithms for current opr, just return | |||
| // otherwise need to profile | |||
| // in order to avoid re-profiling in fastrun | |||
| if (rst.first.valid()) | |||
| return; | |||
| AlgoChooserProfileCache::Result prof_rst; | |||
| @@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts); | |||
| double cur_timeout = 0; | |||
| size_t data_size = 0; | |||
| for (auto ly : m_fastrun_layouts) | |||
| data_size += ly.span().dist_byte(); | |||
| auto workspace_limit = | |||
| m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); | |||
| RealTimer timer; | |||
| @@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| ImplExecutionPolicy policy; | |||
| policy.algo = algo.desc; | |||
| // skip the naive algo; attributes cannot be used to identify it, so we | |||
| // compare its name via strcmp instead | |||
| if (algo.desc.name.compare("NAIVE") == 0) { | |||
| continue; | |||
| } | |||
| //! check negative attribute : skip negative attribute | |||
| auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); | |||
| if (palgo->contain_attribute_any(target_attr.second)) { | |||
| @@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| //! check workspace limit | |||
| construct_execution_policy(selected_strategy, policy); | |||
| mgb_assert( | |||
| policy.algo.valid(), | |||
| "construct execution policy must success when profiling"); | |||
| if (get_workspace_size_bytes(policy) > workspace_limit) { | |||
| // this can fail | |||
| // when constructing a matmul algorithm for a convolution opr | |||
| if (!policy.algo.valid()) | |||
| continue; | |||
| size_t workspace_needed = get_workspace_size_bytes(policy); | |||
| if (data_size + workspace_needed > | |||
| m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { | |||
| continue; | |||
| } | |||
| @@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| }) | |||
| // megbrain uncaught exception | |||
| MGB_CATCH(..., { | |||
| mgb_log_warn("caught exception during %s", msg.c_str()); | |||
| mgb_log_debug("caught exception during %s", msg.c_str()); | |||
| continue; | |||
| }) | |||
| if (!cur_rst.valid()) { | |||
| @@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| "workspace limite requirement(%zu)", | |||
| ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), | |||
| Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit); | |||
| mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); | |||
| // allowed to have empty profile result for current opr | |||
| // append some previous profiled results | |||
| if (rst.second.valid()) | |||
| prof_rst.insert( | |||
| prof_rst.end(), rst.second.val().begin(), rst.second.val().end()); | |||
| FixedTensorLayouts incache_layouts = m_incache_layouts; | |||
| typename Opr::Param origin_param = m_dnn_opr->param(); | |||
| AlgoChooserProfileCache::Key cache_key{ | |||
| incache_layouts.data(), incache_layouts.size(), &origin_param, | |||
| sizeof(origin_param)}; | |||
| AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | |||
| cache.put(cache_key, prof_rst); | |||
| if (!prof_rst.empty()) { | |||
| FixedTensorLayouts incache_layouts = m_incache_layouts; | |||
| typename Opr::Param origin_param = m_dnn_opr->param(); | |||
| AlgoChooserProfileCache::Key cache_key{ | |||
| incache_layouts.data(), incache_layouts.size(), &origin_param, | |||
| sizeof(origin_param)}; | |||
| AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); | |||
| cache.put(cache_key, prof_rst); | |||
| } | |||
| MIDOUT_E | |||
| } | |||
| @@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
| } | |||
| }); | |||
| { | |||
| // first allocate a whole chunk to avoid memory fragmentation (here we | |||
| // rely on memory allocator to reuse memory) | |||
| auto align = cn.get_mem_addr_alignment(); | |||
| size_t tot_size = align; | |||
| for (int i = 0; i < arity; ++i) { | |||
| tot_size += layouts[i].span().high_byte + align; | |||
| } | |||
| for (const auto& layout : preprocessed_layout) { | |||
| tot_size += layout.span().high_byte + align; | |||
| } | |||
| tot_size += param.workspace; | |||
| DeviceTensorStorage storage{cn}; | |||
| storage.ensure_size(tot_size); | |||
| megdnn::Algorithm* algo = | |||
| megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); | |||
| mgb_assert(algo); | |||
| #if !MGB_BUILD_SLIM_SERVING | |||
| #if MGB_CUDA || MGB_ROCM | |||
| // if tot_size > workspace_limit, then skip current algo, return double_max | |||
| // this check is needed because, when profiling an algo with a subprocess, | |||
| // the child process occupies some cuda memory for initialization; | |||
| // this check is more accurate than the previous one | |||
| size_t workspace_limit = | |||
| std::max(cn.get_free_mem(), cn.get_max_block_size_available()); | |||
| auto align = cn.get_mem_addr_alignment(); | |||
| size_t tot_size = align; | |||
| for (int i = 0; i < arity; ++i) { | |||
| tot_size += layouts[i].span().high_byte + align; | |||
| } | |||
| for (const auto& layout : preprocessed_layout) { | |||
| tot_size += layout.span().high_byte + align; | |||
| } | |||
| tot_size += param.workspace; | |||
| if (tot_size > workspace_limit) { | |||
| mgb_log_debug( | |||
| "current memory is not enouugh when profiling algo %s\n", algo->name()); | |||
| return TResult::from_pod(Result{std::numeric_limits<double>::max()}); | |||
| } | |||
| #endif | |||
| #endif | |||
| // allocate input and output memory | |||
| std::array<DeviceTensorND, arity_in> inp_val; | |||
| @@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
| }); | |||
| ev_end->record(); | |||
| megdnn::Algorithm* algo = | |||
| megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); | |||
| mgb_assert(algo); | |||
| double next_report_time = 0.5; | |||
| while (!ev_end->finished()) { | |||
| if (timer.get_secs() >= next_report_time) { | |||
| #if MGB_ENABLE_GETENV | |||
| mgb_log_debug( | |||
| "profiling conv algo %s already took %.3f/%.3f secs" | |||
| "profiling algo %s already took %.3f/%.3f secs" | |||
| " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", | |||
| algo->name(), timer.get_secs(), param.actual_timeout); | |||
| #else | |||
| mgb_log_debug( | |||
| "profiling conv algo %s already took %.3f/%.3f secs", algo->name(), | |||
| "profiling algo %s already took %.3f/%.3f secs", algo->name(), | |||
| timer.get_secs(), param.actual_timeout); | |||
| #endif | |||
| next_report_time = timer.get_secs() + 1; | |||
| @@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( | |||
| std::this_thread::sleep_for(1000us); | |||
| #endif | |||
| } | |||
| DeviceTensorStorage storage; | |||
| for (int i = 0; i < arity_in; ++i) { | |||
| inp_val[i].reset(storage, TensorLayout{}); | |||
| } | |||
| for (int i = 0; i < arity_out; ++i) { | |||
| out_val[i].reset(storage, TensorLayout{}); | |||
| } | |||
| for (size_t i = 0; i < preprocessed_layout.size(); i++) { | |||
| flt_val[i].reset(storage, TensorLayout{}); | |||
| } | |||
| mdn_workspace = megdnn::Workspace{}; | |||
| workspace.reset(storage, TensorLayout{}); | |||
| // release all free blocks owned by child process, | |||
| // in order to avoid main process running out of memory | |||
| cn.try_coalesce_all_free_memory(); | |||