| @@ -39,6 +39,9 @@ endif() | |||||
| add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) | ||||
| target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | ||||
| add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json) | |||||
| target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) | |||||
| target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) | target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) | ||||
| target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) | target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) | ||||
| target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) | target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) | ||||
| @@ -1013,9 +1013,8 @@ void init_tensor(py::module m) { | |||||
| interpreter_for_py->sync(); | interpreter_for_py->sync(); | ||||
| imperative::Profiler::stop_profile(); | imperative::Profiler::stop_profile(); | ||||
| auto results = imperative::Profiler::collect(); | auto results = imperative::Profiler::collect(); | ||||
| auto options = imperative::Profiler::get_options(); | |||||
| return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){ | |||||
| imperative::Profiler::dump_profile(basename, format, results, options); | |||||
| return [results=std::move(results)](std::string basename, std::string format){ | |||||
| imperative::Profiler::dump_profile(basename, format, results); | |||||
| }; | }; | ||||
| }, py::call_guard<py::gil_scoped_release>()); | }, py::call_guard<py::gil_scoped_release>()); | ||||
| m.def("sync", | m.def("sync", | ||||
| @@ -19,6 +19,7 @@ | |||||
| #include "megbrain/imperative/op_def.h" | #include "megbrain/imperative/op_def.h" | ||||
| #include "megbrain/imperative/utils/to_string.h" | #include "megbrain/imperative/utils/to_string.h" | ||||
| #include "./stack_manager.h" | |||||
| #include "./tensor_info.h" | #include "./tensor_info.h" | ||||
| namespace mgb::imperative { | namespace mgb::imperative { | ||||
| @@ -193,7 +194,7 @@ struct PopScope { | |||||
| } | } | ||||
| }; | }; | ||||
| using Command = std::variant<Put, | |||||
| using CommandData = std::variant<Put, | |||||
| ApplyOp, | ApplyOp, | ||||
| Del, | Del, | ||||
| GetValue, | GetValue, | ||||
| @@ -206,14 +207,20 @@ using Command = std::variant<Put, | |||||
| PushScope, | PushScope, | ||||
| PopScope>; | PopScope>; | ||||
| using IdentifiedCommand = std::pair<uint64_t, Command>; | |||||
| struct Command { | |||||
| uint64_t id; | |||||
| CommandData data; | |||||
| StackManager::Trace trace; | |||||
| }; | |||||
| // using IdentifiedCommand = std::pair<uint64_t, Command>; | |||||
| } | } | ||||
| template <> | template <> | ||||
| struct ToStringTrait<interpreter::intl::Command>{ | struct ToStringTrait<interpreter::intl::Command>{ | ||||
| std::string operator()(const interpreter::intl::Command& cmd) const { | std::string operator()(const interpreter::intl::Command& cmd) const { | ||||
| return std::visit([](const auto& cmd){ | |||||
| std::string content = std::visit([](const auto& cmd){ | |||||
| std::string result = cmd.get_name(); | std::string result = cmd.get_name(); | ||||
| result += "{"; | result += "{"; | ||||
| cmd.get_props([&](const char* key, auto&& value) { | cmd.get_props([&](const char* key, auto&& value) { | ||||
| @@ -224,7 +231,8 @@ struct ToStringTrait<interpreter::intl::Command>{ | |||||
| }); | }); | ||||
| result += "}"; | result += "}"; | ||||
| return result; | return result; | ||||
| }, cmd); | |||||
| }, cmd.data); | |||||
| return content; | |||||
| } | } | ||||
| }; | }; | ||||
| @@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) { | |||||
| MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
| mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
| auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
| state.scopes.push("Put"); | |||||
| auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||||
| auto info = put_impl(value, no_cache); | auto info = put_impl(value, no_cache); | ||||
| state.scopes.pop("Put"); | |||||
| return info; | return info; | ||||
| } | } | ||||
| @@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) | |||||
| } | } | ||||
| TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { | TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { | ||||
| auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
| state.scopes.push("Put"); | |||||
| auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||||
| auto info = alloc(); | auto info = alloc(); | ||||
| RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandEvent::Put); | |||||
| RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); | |||||
| init(info, {data.layout(), data.comp_node()}); | init(info, {data.layout(), data.comp_node()}); | ||||
| info->mem_desc.id = StorageIdentifier::make(++m_storage_id); | info->mem_desc.id = StorageIdentifier::make(++m_storage_id); | ||||
| info->ptr = Tensor::make(data, hvalue); | info->ptr = Tensor::make(data, hvalue); | ||||
| RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); | RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); | ||||
| info->status = TensorInfo::Produced; | info->status = TensorInfo::Produced; | ||||
| RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put); | |||||
| state.scopes.pop("Put"); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); | |||||
| return info; | return info; | ||||
| } | } | ||||
| @@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu( | |||||
| auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
| auto name = op->trait()->make_name(*op); | auto name = op->trait()->make_name(*op); | ||||
| state.scopes.push(name); | |||||
| auto _ = StackManager::Guard(name, &state.stack_manager); | |||||
| auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | ||||
| RECORD_EVENT(ShapeInferEvent, validated); | RECORD_EVENT(ShapeInferEvent, validated); | ||||
| @@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu( | |||||
| } | } | ||||
| return op_info; | return op_info; | ||||
| }; | }; | ||||
| RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos)); | |||||
| state.scopes.pop(name); | |||||
| RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, | |||||
| tinfo_to_tid(input_infos), tinfo_to_tid(output_infos), | |||||
| state.stack_manager.dump()); | |||||
| } | } | ||||
| void ChannelImpl::dispatch_kernel( | void ChannelImpl::dispatch_kernel( | ||||
| @@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel( | |||||
| auto& options = state.options; | auto& options = state.options; | ||||
| auto name = op->trait()->make_name(*op); | auto name = op->trait()->make_name(*op); | ||||
| state.scopes.push(name); | |||||
| auto _ = StackManager::Guard{name, &state.stack_manager}; | |||||
| auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | ||||
| RECORD_EVENT(ShapeInferEvent, validated); | RECORD_EVENT(ShapeInferEvent, validated); | ||||
| @@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel( | |||||
| } | } | ||||
| return op_info; | return op_info; | ||||
| }; | }; | ||||
| RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs)); | |||||
| RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, | |||||
| tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs), | |||||
| state.stack_manager.dump()); | |||||
| m_buffer.enqueue(std::move(cmd)); | m_buffer.enqueue(std::move(cmd)); | ||||
| if (!validated && options.async_level == 1) { | if (!validated && options.async_level == 1) { | ||||
| sync_impl(); | sync_impl(); | ||||
| @@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel( | |||||
| info->ptr->comp_node().sync(); | info->ptr->comp_node().sync(); | ||||
| } | } | ||||
| } | } | ||||
| state.scopes.pop(name); | |||||
| } | } | ||||
| SmallVector<Handle> ChannelImpl::apply_op( | SmallVector<Handle> ChannelImpl::apply_op( | ||||
| @@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() { | |||||
| }(); | }(); | ||||
| info->id = Profiler::next_id(); | info->id = Profiler::next_id(); | ||||
| if (Profiler::is_profiling()) { | if (Profiler::is_profiling()) { | ||||
| info->name = state.scopes.next_tensor_name(); | |||||
| size_t tensor_id = state.stack_manager.current()->next_id("tensor"); | |||||
| info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id); | |||||
| } | } | ||||
| return info; | return info; | ||||
| } | } | ||||
| @@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) { | |||||
| } | } | ||||
| void ChannelImpl::recursive_free(TensorInfo* ptr) { | void ChannelImpl::recursive_free(TensorInfo* ptr) { | ||||
| RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree); | |||||
| RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); | |||||
| SmallVector<TensorInfo*> inps; | SmallVector<TensorInfo*> inps; | ||||
| if (ptr->producer) { | if (ptr->producer) { | ||||
| for (auto i : ptr->producer->inputs) { | for (auto i : ptr->producer->inputs) { | ||||
| @@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) { | |||||
| recursive_free(i); | recursive_free(i); | ||||
| } | } | ||||
| } | } | ||||
| RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); | |||||
| } | } | ||||
| void ChannelImpl::real_free(TensorInfo* ptr) { | void ChannelImpl::real_free(TensorInfo* ptr) { | ||||
| @@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) { | |||||
| m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); | m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); | ||||
| if (!m_applying) flush_apply_stack(); | if (!m_applying) flush_apply_stack(); | ||||
| } else if (dest->evict_type == EvictType::SWAP) { | } else if (dest->evict_type == EvictType::SWAP) { | ||||
| RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen); | |||||
| RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); | |||||
| produce_tensor(dest, Tensor::make(dest->h_value)); | produce_tensor(dest, Tensor::make(dest->h_value)); | ||||
| RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); | |||||
| } | } | ||||
| } | } | ||||
| @@ -721,22 +721,24 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { | |||||
| // refcnt --, owners: [tensor_inputs] | // refcnt --, owners: [tensor_inputs] | ||||
| // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor | // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor | ||||
| uint64_t del_id = del->id; | uint64_t del_id = del->id; | ||||
| RECORD_EVENT(OpDelEvent, del_id); | |||||
| RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); | |||||
| free(del); | free(del); | ||||
| RECORD_EVENT(OpDelFinishEvent, del_id); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); | |||||
| } | } | ||||
| // Before wait | // Before wait | ||||
| //TODO: split operator wait and execute so that OpWait could be corrected recorded. | //TODO: split operator wait and execute so that OpWait could be corrected recorded. | ||||
| // Before execute | // Before execute | ||||
| for (auto&& [device, kernel_id]: kernels) { | for (auto&& [device, kernel_id]: kernels) { | ||||
| RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device)); | |||||
| RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); | |||||
| RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
| } | } | ||||
| // Apply op | // Apply op | ||||
| // Here std::move is REQUIRED for removing duplicated references. | // Here std::move is REQUIRED for removing duplicated references. | ||||
| auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); | auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); | ||||
| // After execute | // After execute | ||||
| for (auto&& [device, kernel_id]: kernels) { | for (auto&& [device, kernel_id]: kernels) { | ||||
| RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device)); | |||||
| RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
| RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); | |||||
| } | } | ||||
| // End profiling operator | // End profiling operator | ||||
| mgb_assert(outputs.size() == cmd.outputs.size()); | mgb_assert(outputs.size() == cmd.outputs.size()); | ||||
| @@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() { | |||||
| m_dtr.pin(cmd.inputs); | m_dtr.pin(cmd.inputs); | ||||
| } | } | ||||
| if (recomp) { | if (recomp) { | ||||
| RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen); | |||||
| RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); | |||||
| } | } | ||||
| } | } | ||||
| bool regen = false; | bool regen = false; | ||||
| @@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() { | |||||
| m_apply_stack.pop(); | m_apply_stack.pop(); | ||||
| do_apply_op(cmd_backup); | do_apply_op(cmd_backup); | ||||
| if (recomp_backup) { | if (recomp_backup) { | ||||
| RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); | |||||
| for (auto o : cmd_backup.outputs) { | for (auto o : cmd_backup.outputs) { | ||||
| if (o) { | if (o) { | ||||
| m_dtr.update_dsu_after_recompute(o); | m_dtr.update_dsu_after_recompute(o); | ||||
| @@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { | |||||
| check_worker_exc_unsafe(); | check_worker_exc_unsafe(); | ||||
| return require_host ? host_available() : static_cast<bool>(info->ptr); | return require_host ? host_available() : static_cast<bool>(info->ptr); | ||||
| }); | }); | ||||
| RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr); | |||||
| RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); | |||||
| m_waitee = nullptr; | m_waitee = nullptr; | ||||
| return info->ptr; | return info->ptr; | ||||
| } | } | ||||
| @@ -1003,7 +1005,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt | |||||
| return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)}; | return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)}; | ||||
| } | } | ||||
| void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| void ChannelImpl::process_one_task(Command& icmd) { | |||||
| using namespace ranges; | using namespace ranges; | ||||
| using namespace ranges::views; | using namespace ranges::views; | ||||
| auto& state = get_worker_state(); | auto& state = get_worker_state(); | ||||
| @@ -1012,10 +1014,12 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| auto cmd_visitor = [&](const auto& cmd) { | auto cmd_visitor = [&](const auto& cmd) { | ||||
| using T = std::decay_t<decltype(cmd)>; | using T = std::decay_t<decltype(cmd)>; | ||||
| if constexpr (std::is_same_v<T, Put>) { | if constexpr (std::is_same_v<T, Put>) { | ||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put); | |||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); | |||||
| RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||||
| auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value); | auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value); | ||||
| RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||||
| produce_tensor(cmd.dest, std::move(value)); | produce_tensor(cmd.dest, std::move(value)); | ||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); | |||||
| sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
| } else if constexpr (std::is_same_v<T, ApplyOp>) { | } else if constexpr (std::is_same_v<T, ApplyOp>) { | ||||
| for (auto& i : cmd.inputs) { | for (auto& i : cmd.inputs) { | ||||
| @@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| } | } | ||||
| } | } | ||||
| } else if constexpr (std::is_same_v<T, Del>) { | } else if constexpr (std::is_same_v<T, Del>) { | ||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del); | |||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); | |||||
| CompNode device = cmd.dest->desc.comp_node; | CompNode device = cmd.dest->desc.comp_node; | ||||
| uint64_t tensor_id = cmd.dest->id; | uint64_t tensor_id = cmd.dest->id; | ||||
| free(cmd.dest); | free(cmd.dest); | ||||
| RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); | |||||
| sample_on_device(device, false); | sample_on_device(device, false); | ||||
| } else if constexpr (std::is_same_v<T, GetValue>) { | } else if constexpr (std::is_same_v<T, GetValue>) { | ||||
| if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
| @@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| imperative_log_profile_end("GetValue"); | imperative_log_profile_end("GetValue"); | ||||
| } else if constexpr (std::is_same_v<T, SwapIn>) { | } else if constexpr (std::is_same_v<T, SwapIn>) { | ||||
| if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn); | |||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||||
| produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); | produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); | ||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||||
| sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
| } else if constexpr (std::is_same_v<T, SwapOut>) { | } else if constexpr (std::is_same_v<T, SwapOut>) { | ||||
| if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut); | |||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||||
| cmd.dest->h_value = cmd.dest->ptr->get_value(); | cmd.dest->h_value = cmd.dest->ptr->get_value(); | ||||
| if (cmd.dest->evict_type == EvictType::NONE) { | if (cmd.dest->evict_type == EvictType::NONE) { | ||||
| cmd.dest->evict_type = EvictType::SWAP; | cmd.dest->evict_type = EvictType::SWAP; | ||||
| cmd.dest->status = TensorInfo::Swapped; | cmd.dest->status = TensorInfo::Swapped; | ||||
| release_tensor(cmd.dest); | release_tensor(cmd.dest); | ||||
| } | } | ||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||||
| sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
| } else if constexpr (std::is_same_v<T, Drop>) { | } else if constexpr (std::is_same_v<T, Drop>) { | ||||
| if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop); | |||||
| RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); | |||||
| do_drop(cmd.dest, true); | do_drop(cmd.dest, true); | ||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop); | |||||
| RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); | |||||
| } else if constexpr (std::is_same_v<T, SetOption>) { | } else if constexpr (std::is_same_v<T, SetOption>) { | ||||
| options.set_option(cmd.key, cmd.value); | options.set_option(cmd.key, cmd.value); | ||||
| } else if constexpr (std::is_same_v<T, StartProfile>) { | } else if constexpr (std::is_same_v<T, StartProfile>) { | ||||
| @@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| if (Profiler::get_option("sample_rate", 0)) { | if (Profiler::get_option("sample_rate", 0)) { | ||||
| sample_on_device(device, true); | sample_on_device(device, true); | ||||
| } | } | ||||
| RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
| }); | }); | ||||
| RECORD_EVENT(StartProfileFinishEvent); | RECORD_EVENT(StartProfileFinishEvent); | ||||
| } else if constexpr (std::is_same_v<T, StopProfile>) { | } else if constexpr (std::is_same_v<T, StopProfile>) { | ||||
| @@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| notify_tensor_unsafe(m_waitee); | notify_tensor_unsafe(m_waitee); | ||||
| } | } | ||||
| } | } | ||||
| }, icmd.second); | |||||
| }, icmd.data); | |||||
| } | } | ||||
| void ChannelImpl::check_worker_exc_unsafe() { | void ChannelImpl::check_worker_exc_unsafe() { | ||||
| @@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() { | |||||
| } | } | ||||
| } | } | ||||
| void ChannelImpl::CommandBuffer::enqueue(Command cmd) { | |||||
| void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) { | |||||
| auto& state = m_owner->get_channel_state(); | |||||
| if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) { | if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) { | ||||
| return; | return; | ||||
| } | } | ||||
| // mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); | // mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); | ||||
| m_commands.push_back(std::move(cmd)); | |||||
| m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()}); | |||||
| auto flush_pos = flush_pos_for(m_commands.back()); | auto flush_pos = flush_pos_for(m_commands.back()); | ||||
| flush(flush_pos); | flush(flush_pos); | ||||
| } | } | ||||
| @@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) { | |||||
| if (Profiler::is_profiling()) { | if (Profiler::is_profiling()) { | ||||
| mgb_log_debug("%s Flushed", to_string(*iter).c_str()); | mgb_log_debug("%s Flushed", to_string(*iter).c_str()); | ||||
| } | } | ||||
| m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)}); | |||||
| m_owner->m_worker.add_task(std::move(*iter)); | |||||
| } | } | ||||
| m_commands.erase(m_commands.begin(), pos); | m_commands.erase(m_commands.begin(), pos); | ||||
| } | } | ||||
| @@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle { | |||||
| return m_commands.begin() + (m_commands.size() - buffer_length); | return m_commands.begin() + (m_commands.size() - buffer_length); | ||||
| } | } | ||||
| return m_commands.begin(); | return m_commands.begin(); | ||||
| }, cmd); | |||||
| }, cmd.data); | |||||
| } | } | ||||
| /** | /** | ||||
| @@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||||
| // TODO: eliminate Puts | // TODO: eliminate Puts | ||||
| auto begin = m_commands.begin(), end = m_commands.end(); | auto begin = m_commands.begin(), end = m_commands.end(); | ||||
| auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ | auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ | ||||
| if (auto* apply = std::get_if<ApplyOp>(&cmd)) { | |||||
| if (auto* apply = std::get_if<ApplyOp>(&cmd.data)) { | |||||
| return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; | return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; | ||||
| } | } | ||||
| return false; | return false; | ||||
| @@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| // mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); | // mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); | ||||
| std::get<ApplyOp>(*apply_iter).dels.push_back(dest); | |||||
| std::get<ApplyOp>(apply_iter->data).dels.push_back(dest); | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range) | |||||
| found = iter; | found = iter; | ||||
| } | } | ||||
| } | } | ||||
| }, *iter); | |||||
| }, iter->data); | |||||
| }; | }; | ||||
| return found; | return found; | ||||
| } | } | ||||
| @@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range) | |||||
| return cmd.dest == dest; | return cmd.dest == dest; | ||||
| } | } | ||||
| return false; | return false; | ||||
| }, cmd); | |||||
| }, cmd.data); | |||||
| }); | }); | ||||
| } | } | ||||
| @@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) { | |||||
| MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
| mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
| auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
| state.scopes.push(name); | |||||
| state.stack_manager.enter(name); | |||||
| RECORD_EVENT(ScopeEvent, name); | RECORD_EVENT(ScopeEvent, name); | ||||
| m_buffer.enqueue(PushScope{name}); | m_buffer.enqueue(PushScope{name}); | ||||
| } | } | ||||
| @@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) { | |||||
| MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
| mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
| auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
| state.scopes.pop(name); | |||||
| state.stack_manager.exit(name); | |||||
| RECORD_EVENT(ScopeFinishEvent, name); | RECORD_EVENT(ScopeFinishEvent, name); | ||||
| m_buffer.enqueue(PopScope{name}); | m_buffer.enqueue(PopScope{name}); | ||||
| } | } | ||||
| @@ -26,6 +26,7 @@ | |||||
| #include "./commands.h" | #include "./commands.h" | ||||
| #include "./tensor_info.h" | #include "./tensor_info.h" | ||||
| #include "./option_manager.h" | #include "./option_manager.h" | ||||
| #include "./stack_manager.h" | |||||
| #include "../profiler/events.h" | #include "../profiler/events.h" | ||||
| @@ -94,7 +95,7 @@ private: | |||||
| TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); | TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); | ||||
| void notify_tensor_unsafe(TensorInfo* info); | void notify_tensor_unsafe(TensorInfo* info); | ||||
| void process_one_task(IdentifiedCommand&); | |||||
| void process_one_task(Command&); | |||||
| void check_worker_exc_unsafe(); | void check_worker_exc_unsafe(); | ||||
| @@ -129,10 +130,10 @@ private: | |||||
| void assert_in_worker(); | void assert_in_worker(); | ||||
| std::thread::id get_worker_tid(); | std::thread::id get_worker_tid(); | ||||
| template <typename TCommand> | |||||
| void enqueue_command(TCommand&& cmd) { | |||||
| m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||||
| } | |||||
| // template <typename TCommand> | |||||
| // void enqueue_command(TCommand&& cmd) { | |||||
| // m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||||
| // } | |||||
| void sample_on_device(CompNode device, bool force); | void sample_on_device(CompNode device, bool force); | ||||
| @@ -153,13 +154,13 @@ private: | |||||
| bool m_applying = false; | bool m_applying = false; | ||||
| bool m_closed = false; | bool m_closed = false; | ||||
| struct WorkQueue : AsyncQueueSC<IdentifiedCommand, WorkQueue> { | |||||
| struct WorkQueue : AsyncQueueSC<Command, WorkQueue> { | |||||
| // set max_spin=0 to prevent Queue fetch task in busy wait manner. | // set max_spin=0 to prevent Queue fetch task in busy wait manner. | ||||
| // this won't affect throughput when python interpreter is sending enough task, | // this won't affect throughput when python interpreter is sending enough task, | ||||
| // but will significantly save CPU time when waiting for task, e.g. wait for data input | // but will significantly save CPU time when waiting for task, e.g. wait for data input | ||||
| // limit pending tasks to 10000 | // limit pending tasks to 10000 | ||||
| WorkQueue(ChannelImpl* owner) | WorkQueue(ChannelImpl* owner) | ||||
| : AsyncQueueSC<IdentifiedCommand, WorkQueue>(0, 10000), m_owner(owner) { | |||||
| : AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) { | |||||
| sys::set_thread_name("interpreter"); | sys::set_thread_name("interpreter"); | ||||
| if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { | if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { | ||||
| int len = strlen(env_val); | int len = strlen(env_val); | ||||
| @@ -171,7 +172,7 @@ private: | |||||
| update_max_items(val); | update_max_items(val); | ||||
| } | } | ||||
| } | } | ||||
| void process_one_task(IdentifiedCommand& icmd) { | |||||
| void process_one_task(Command& icmd) { | |||||
| m_owner->process_one_task(icmd); | m_owner->process_one_task(icmd); | ||||
| } | } | ||||
| void on_async_queue_worker_thread_start() override; | void on_async_queue_worker_thread_start() override; | ||||
| @@ -193,7 +194,7 @@ private: | |||||
| */ | */ | ||||
| struct CommandBuffer { | struct CommandBuffer { | ||||
| CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} | CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} | ||||
| void enqueue(Command cmd); | |||||
| void enqueue(CommandData cmd); | |||||
| bool empty() const { | bool empty() const { | ||||
| return m_commands.empty(); | return m_commands.empty(); | ||||
| } | } | ||||
| @@ -224,91 +225,13 @@ private: | |||||
| //! level 0: both sync. | //! level 0: both sync. | ||||
| int m_async_level = 2; | int m_async_level = 2; | ||||
| struct Scope { | |||||
| std::string name; | |||||
| std::unordered_map<std::string, std::unique_ptr<Scope>> children; | |||||
| size_t version = 0; | |||||
| size_t parent_version = 0; | |||||
| size_t tensor_count = 0; | |||||
| Scope* active_child = nullptr; | |||||
| Scope* parent = nullptr; | |||||
| Scope* enter(std::string name) { | |||||
| auto& child = children[name]; | |||||
| if (!child) { | |||||
| child = std::make_unique<Scope>(); | |||||
| child->name = name; | |||||
| child->parent = this; | |||||
| } | |||||
| if (version != child->parent_version) { | |||||
| child->version = 0; | |||||
| child->parent_version = version; | |||||
| } else { | |||||
| child->version++; | |||||
| } | |||||
| child->tensor_count = 0; | |||||
| return active_child = child.get(); | |||||
| } | |||||
| Scope* exit(std::string name) { | |||||
| mgb_assert(this->name == name, "scope name mismatch"); | |||||
| parent->active_child = nullptr; | |||||
| return parent; | |||||
| } | |||||
| }; | |||||
| class ScopeManager { | |||||
| private: | |||||
| Scope m_root; | |||||
| Scope* m_current_scope = &m_root; | |||||
| public: | |||||
| class ScopeGuard{ | |||||
| private: | |||||
| ScopeManager* m_manager; | |||||
| std::string m_name; | |||||
| public: | |||||
| ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} { | |||||
| m_manager->push(m_name); | |||||
| } | |||||
| ~ScopeGuard() { | |||||
| m_manager->pop(m_name); | |||||
| } | |||||
| }; | |||||
| void push(std::string name) { | |||||
| m_current_scope = m_current_scope->enter(name); | |||||
| } | |||||
| void pop(std::string name) { | |||||
| m_current_scope = m_current_scope->exit(name); | |||||
| } | |||||
| std::string next_tensor_name() { | |||||
| std::string builder; | |||||
| Scope* scope = &m_root; | |||||
| while (true) { | |||||
| builder.append(scope->name); | |||||
| if (scope->version != 0) { | |||||
| builder.append(ssprintf("(%ld)", scope->version)); | |||||
| } | |||||
| if (scope != &m_root) { | |||||
| builder.append("."); | |||||
| } | |||||
| if (scope->active_child == nullptr) { | |||||
| builder.append(ssprintf(":%%%ld", scope->tensor_count++)); | |||||
| break; | |||||
| } else { | |||||
| scope = scope->active_child; | |||||
| } | |||||
| } | |||||
| return builder; | |||||
| } | |||||
| }; | |||||
| struct State { | struct State { | ||||
| std::thread::id tid; | std::thread::id tid; | ||||
| OptionManager options; | OptionManager options; | ||||
| }; | }; | ||||
| struct ChannelState: State { | struct ChannelState: State { | ||||
| ScopeManager scopes; | |||||
| StackManager stack_manager; | |||||
| }; | }; | ||||
| struct WorkerState: State {}; | struct WorkerState: State {}; | ||||
| @@ -0,0 +1,188 @@ | |||||
| /** | |||||
| * \file imperative/src/impl/interpreter/stack_manager.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include <string> | |||||
| #include <memory> | |||||
| #include <unordered_map> | |||||
| #include "megbrain/utils/metahelper.h" | |||||
| #include "megbrain/utils/small_vector.h" | |||||
| namespace mgb::imperative::interpreter::intl{ | |||||
| class StackSnapshot; | |||||
//! Tracks the current nesting of named scopes as a persistent tree so each
//! dispatched operation can be attributed a stable, versioned stack trace.
//! Not thread-safe; intended to live in per-channel state.
class StackManager: public NonCopyableObj {
public:
    class Node;
    class Guard;
    struct Frame;
    class Trace;
private:
    // Owning root of the scope tree; m_current always points into this tree.
    std::unique_ptr<Node> m_root = nullptr;
    Node* m_current = nullptr;
    // One unique id per currently-open scope; dump() reports the top one.
    SmallVector<uint64_t> m_trace_id_stack;
    uint64_t m_last_trace_id = 0;
public:
    StackManager();
    // Push scope `name`; returns the (get-or-created) node and its version.
    std::pair<Node*, uint64_t> enter(std::string name);
    // Pop scope `name`; asserts it matches the innermost open scope.
    void exit(std::string name);
    // Snapshot the current stack (root..current) as a Trace.
    Trace dump();
    Node* current();
};
| class StackManager::Node: public NonCopyableObj { | |||||
| private: | |||||
| std::string m_name; | |||||
| std::unordered_map<std::string, std::unique_ptr<Node>> m_children; | |||||
| std::unordered_map<std::string, size_t> m_id_table; | |||||
| Node* m_parent = nullptr; | |||||
| int64_t m_depth = -1; | |||||
| uint64_t m_version = 0; | |||||
| explicit Node(std::string name, Node* parent): m_name{name}, m_parent{parent} { | |||||
| if (parent) { | |||||
| m_depth = parent->m_depth + 1; | |||||
| } | |||||
| } | |||||
| public: | |||||
| const std::string& name() const { | |||||
| return m_name; | |||||
| } | |||||
| Node* operator[](const std::string& name) { | |||||
| auto& child = m_children[name]; | |||||
| if (child == nullptr) { | |||||
| child.reset(new Node(name, this)); | |||||
| } | |||||
| return child.get(); | |||||
| } | |||||
| Node* parent() { | |||||
| return m_parent; | |||||
| } | |||||
| bool is_root() { | |||||
| return m_parent == nullptr; | |||||
| } | |||||
| uint64_t version() const { | |||||
| return m_version; | |||||
| } | |||||
| void update_version() { | |||||
| ++m_version; | |||||
| for (auto&& [key, child]: m_children) { | |||||
| child->reset_version(); | |||||
| } | |||||
| m_id_table.clear(); | |||||
| } | |||||
| void reset_version() { | |||||
| m_version = 0; | |||||
| m_id_table.clear(); | |||||
| } | |||||
| int64_t depth() const { | |||||
| return m_depth; | |||||
| } | |||||
| uint64_t next_id(std::string key) { | |||||
| return m_id_table[key]++; | |||||
| } | |||||
| static std::unique_ptr<Node> make() { | |||||
| return std::unique_ptr<Node>(new Node("", nullptr)); | |||||
| } | |||||
| }; | |||||
| class StackManager::Guard { | |||||
| private: | |||||
| std::string m_name; | |||||
| StackManager* m_manager; | |||||
| public: | |||||
| Guard(std::string name, StackManager* manager): m_name{name}, m_manager{manager}{ | |||||
| if (m_manager) { | |||||
| m_manager->enter(name); | |||||
| } | |||||
| } | |||||
| ~Guard() { | |||||
| release(); | |||||
| } | |||||
| void release() { | |||||
| if (m_manager) { | |||||
| m_manager->exit(m_name); | |||||
| m_manager = nullptr; | |||||
| } | |||||
| } | |||||
| }; | |||||
//! One frame of a captured Trace: a scope node together with the version it
//! had at capture time (node versions keep advancing afterwards).
struct StackManager::Frame {
    StackManager::Node* node;
    uint64_t version;
};
| class StackManager::Trace { | |||||
| private: | |||||
| SmallVector<StackManager::Frame> m_frames; | |||||
| uint64_t m_id = 0; | |||||
| public: | |||||
| explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} { | |||||
| int64_t nr_frames = top->depth() + 1; | |||||
| m_frames = SmallVector<StackManager::Frame>(nr_frames); | |||||
| StackManager::Node* node = top; | |||||
| for (int64_t i = 0; i < nr_frames; ++i) { | |||||
| m_frames[m_frames.size()-1-i] = {node, node->version()}; | |||||
| node = node->parent(); | |||||
| } | |||||
| mgb_assert(node->is_root() , ""); | |||||
| } | |||||
| Trace() = default; | |||||
| std::string to_string() const { | |||||
| std::string buffer; | |||||
| for (auto&& [node, version]: m_frames) { | |||||
| if (!buffer.empty()) { | |||||
| buffer.append("."); | |||||
| } | |||||
| buffer.append(node->name()); | |||||
| if (version != 0) { | |||||
| buffer.append(ssprintf("[%zu]", version)); | |||||
| } | |||||
| } | |||||
| return buffer; | |||||
| } | |||||
| const SmallVector<StackManager::Frame>& frames() const { | |||||
| return m_frames; | |||||
| } | |||||
| uint64_t id() const { | |||||
| return m_id; | |||||
| } | |||||
| }; | |||||
| inline StackManager::StackManager() { | |||||
| m_root = Node::make(); | |||||
| m_current = m_root.get(); | |||||
| } | |||||
| inline std::pair<StackManager::Node*, uint64_t> StackManager::enter(std::string name) { | |||||
| m_current = (*m_current)[name]; | |||||
| m_trace_id_stack.push_back(++m_last_trace_id); | |||||
| return {m_current, m_current->version()}; | |||||
| } | |||||
| inline void StackManager::exit(std::string name) { | |||||
| mgb_assert(m_current->name() == name, "scope name mismatch"); | |||||
| m_current = m_current->parent(); | |||||
| m_trace_id_stack.pop_back(); | |||||
| m_current->update_version(); | |||||
| } | |||||
| inline StackManager::Trace StackManager::dump() { | |||||
| return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back()); | |||||
| } | |||||
//! Innermost currently-open scope (the root node when no scope is open).
inline StackManager::Node* StackManager::current() {
    return m_current;
}
| } | |||||
| @@ -102,7 +102,8 @@ std::vector<std::pair<const char*, std::string>> props(const OpDef& def) { | |||||
| } | } | ||||
| std::string make_name(const OpDef& def) { | std::string make_name(const OpDef& def) { | ||||
| return "OprAttr"; | |||||
| auto&& attr = def.cast_final_safe<OprAttr>(); | |||||
| return attr.type; | |||||
| } | } | ||||
| OP_TRAIT_REG(OprAttr, OprAttr) | OP_TRAIT_REG(OprAttr, OprAttr) | ||||
| @@ -27,25 +27,11 @@ | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace imperative { | namespace imperative { | ||||
| uint64_t Timer::get_nsecs() { | |||||
| using namespace std::chrono; | |||||
| auto finish = steady_clock::now(); | |||||
| auto duration = duration_cast<nanoseconds>(finish - m_start); | |||||
| return duration.count(); | |||||
| profiler::Time Timer::record_host() { | |||||
| return std::chrono::high_resolution_clock::now(); | |||||
| } | } | ||||
| uint64_t Timer::get_started_at() { | |||||
| return m_started_at; | |||||
| } | |||||
| void Timer::reset() { | |||||
| using namespace std::chrono; | |||||
| m_start = steady_clock::now(); | |||||
| auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch()); | |||||
| m_started_at = now_ns.count(); | |||||
| } | |||||
| std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) { | |||||
| std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { | |||||
| auto event = EventPool::with_timer().alloc_shared(device); | auto event = EventPool::with_timer().alloc_shared(device); | ||||
| event->record(); | event->record(); | ||||
| return event; | return event; | ||||
| @@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options; | |||||
| std::mutex Profiler::sm_mutex; | std::mutex Profiler::sm_mutex; | ||||
| std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | ||||
| Timer Profiler::sm_timer; | Timer Profiler::sm_timer; | ||||
| profiler::HostTime Profiler::sm_start_at; | |||||
| std::atomic_uint64_t Profiler::sm_last_id = 0; | std::atomic_uint64_t Profiler::sm_last_id = 0; | ||||
| bool Profiler::sm_profiling = false; | bool Profiler::sm_profiling = false; | ||||
| thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | ||||
| std::atomic_size_t Profiler::sm_preferred_capacity; | std::atomic_size_t Profiler::sm_preferred_capacity; | ||||
| auto Profiler::get_thread_dict() -> thread_dict_t { | auto Profiler::get_thread_dict() -> thread_dict_t { | ||||
| MGB_LOCK_GUARD(sm_mutex); | |||||
| thread_dict_t thread_dict; | thread_dict_t thread_dict; | ||||
| for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
| thread_dict[tid] = profiler->m_thread_name; | thread_dict[tid] = profiler->m_thread_name; | ||||
| @@ -69,15 +55,13 @@ auto Profiler::get_thread_dict() -> thread_dict_t { | |||||
| return thread_dict; | return thread_dict; | ||||
| } | } | ||||
| void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) { | |||||
| auto thread_dict = get_thread_dict(); | |||||
| if (format == "chrome_timeline.json") { | |||||
| profiler::dump_chrome_timeline(basename, options, thread_dict, results); | |||||
| } else if (format == "memory_flow.svg") { | |||||
| profiler::dump_memory_flow(basename, options, thread_dict, results); | |||||
| } else { | |||||
| void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) { | |||||
| std::unordered_map<std::string, void(*)(std::string, bundle_t)> format_table; | |||||
| auto iter = format_table.find(format); | |||||
| if (iter == format_table.end()) { | |||||
| mgb_log_error("unsupported profiling format %s", format.c_str()); | mgb_log_error("unsupported profiling format %s", format.c_str()); | ||||
| } | } | ||||
| return (iter->second)(basename, result); | |||||
| } | } | ||||
| } // namespace imperative | } // namespace imperative | ||||
| @@ -17,6 +17,10 @@ | |||||
| #error Unsupported platform | #error Unsupported platform | ||||
| #endif | #endif | ||||
| #include "nlohmann/json.hpp" | |||||
| #include "megbrain/utils/debug.h" | |||||
| #include "./formats.h" | #include "./formats.h" | ||||
| #include "./states.h" | #include "./states.h" | ||||
| @@ -53,8 +57,9 @@ public: | |||||
| m_idx = idx; | m_idx = idx; | ||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& ts(uint64_t ts) { | |||||
| m_ts = ts; | |||||
| template <typename TDuration> | |||||
| ChromeTraceEvent& ts(TDuration ts) { | |||||
| m_ts = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(ts).count(); | |||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& dur(uint64_t dur) { | ChromeTraceEvent& dur(uint64_t dur) { | ||||
| @@ -69,51 +74,46 @@ public: | |||||
| m_bp = bp; | m_bp = bp; | ||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& args(std::shared_ptr<json::Object> args) { | |||||
| ChromeTraceEvent& args(nlohmann::json args) { | |||||
| m_args = std::move(args); | m_args = std::move(args); | ||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& arg(std::string key, std::string value) { | ChromeTraceEvent& arg(std::string key, std::string value) { | ||||
| if (!m_args) { | |||||
| m_args = json::Object::make(); | |||||
| } | |||||
| (*m_args)[key] = json::String::make(value); | |||||
| m_args[key] = value; | |||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& arg(std::string key, double value) { | ChromeTraceEvent& arg(std::string key, double value) { | ||||
| if (!m_args) { | |||||
| m_args = json::Object::make(); | |||||
| } | |||||
| (*m_args)[key] = json::Number::make(value); | |||||
| m_args[key] = value; | |||||
| return *this; | return *this; | ||||
| } | } | ||||
| ChromeTraceEvent& arg(std::string key, std::shared_ptr<json::Value> value) { | |||||
| if (!m_args) { | |||||
| m_args = json::Object::make(); | |||||
| } | |||||
| (*m_args)[key] = value; | |||||
| ChromeTraceEvent& arg(std::string key, nlohmann::json value) { | |||||
| m_args[key] = value; | |||||
| return *this; | |||||
| } | |||||
| ChromeTraceEvent& stack(Trace trace) { | |||||
| m_stack = std::move(trace); | |||||
| return *this; | return *this; | ||||
| } | } | ||||
| std::shared_ptr<json::Object> to_json() const { | |||||
| auto result = json::Object::make(); | |||||
| nlohmann::json to_json() const { | |||||
| nlohmann::json result; | |||||
| auto prop_str = [&](auto key, auto value) { | auto prop_str = [&](auto key, auto value) { | ||||
| if (value.empty()) { | if (value.empty()) { | ||||
| return; | return; | ||||
| } | } | ||||
| (*result)[key] = json::String::make(value); | |||||
| result[key] = value; | |||||
| }; | }; | ||||
| auto prop_num = [&](auto key, auto value) { | auto prop_num = [&](auto key, auto value) { | ||||
| if (!value) { | if (!value) { | ||||
| return; | return; | ||||
| } | } | ||||
| (*result)[key] = json::Number::make(value.value()); | |||||
| result[key] = value.value(); | |||||
| }; | }; | ||||
| auto prop_char = [&](auto key, auto value) { | auto prop_char = [&](auto key, auto value) { | ||||
| if (!value) { | if (!value) { | ||||
| return; | return; | ||||
| } | } | ||||
| (*result)[key] = json::String::make(std::string{} + value.value()); | |||||
| result[key] = std::string{} + value.value(); | |||||
| }; | }; | ||||
| prop_str("name", m_name); | prop_str("name", m_name); | ||||
| prop_str("cat", m_cat); | prop_str("cat", m_cat); | ||||
| @@ -126,8 +126,16 @@ public: | |||||
| prop_num("dur", m_dur); | prop_num("dur", m_dur); | ||||
| prop_char("ph", m_ph); | prop_char("ph", m_ph); | ||||
| prop_char("bp", m_bp); | prop_char("bp", m_bp); | ||||
| if (m_args) { | |||||
| (*result)["args"] = m_args; | |||||
| if (!m_args.empty()) { | |||||
| result["args"] = m_args; | |||||
| } | |||||
| if (m_stack) { | |||||
| nlohmann::json stack; | |||||
| for (auto&& frame: m_stack->frames()) { | |||||
| stack.push_back(ssprintf("%s%ld", frame.node->name().c_str(), frame.version)); | |||||
| } | |||||
| std::reverse(stack.begin(), stack.end()); | |||||
| result["stack"] = stack; | |||||
| } | } | ||||
| return result; | return result; | ||||
| } | } | ||||
| @@ -140,11 +148,12 @@ private: | |||||
| std::optional<uint64_t> m_pid; | std::optional<uint64_t> m_pid; | ||||
| std::optional<uint64_t> m_id; | std::optional<uint64_t> m_id; | ||||
| std::optional<uint64_t> m_idx; | std::optional<uint64_t> m_idx; | ||||
| std::optional<uint64_t> m_ts; | |||||
| std::optional<double> m_ts; | |||||
| std::optional<uint64_t> m_dur; | std::optional<uint64_t> m_dur; | ||||
| std::optional<char> m_ph; | std::optional<char> m_ph; | ||||
| std::optional<char> m_bp; | std::optional<char> m_bp; | ||||
| std::shared_ptr<json::Object> m_args; | |||||
| nlohmann::json m_args; | |||||
| std::optional<Trace> m_stack; | |||||
| }; | }; | ||||
| class ChromeTraceEvents { | class ChromeTraceEvents { | ||||
| @@ -154,368 +163,218 @@ public: | |||||
| return m_content.back(); | return m_content.back(); | ||||
| } | } | ||||
| std::shared_ptr<json::Value> to_json() const { | |||||
| auto result = json::Object::make(); | |||||
| auto event_list = json::Array::make(); | |||||
| std::string metadata(std::string key) { | |||||
| return m_metadata[key]; | |||||
| } | |||||
| nlohmann::json to_json() const { | |||||
| nlohmann::json result; | |||||
| nlohmann::json event_list; | |||||
| nlohmann::json metadata; | |||||
| for (auto&& event: m_content) { | for (auto&& event: m_content) { | ||||
| event_list->add(event.to_json()); | |||||
| event_list.push_back(event.to_json()); | |||||
| } | |||||
| for (auto&& [key, value]: m_metadata) { | |||||
| metadata[key] = value; | |||||
| } | } | ||||
| (*result)["traceEvents"] = event_list; | |||||
| result["traceEvents"] = event_list; | |||||
| result["metadata"] = metadata; | |||||
| return result; | return result; | ||||
| } | } | ||||
| std::string to_string() const { | |||||
| auto json = to_json(); | |||||
| return "{" "traceEvents:" + nlohmann::to_string(json["traceEvents"]) + "," | |||||
| "metadata:" + nlohmann::to_string(json["metadata"]) + "}"; | |||||
| } | |||||
| private: | private: | ||||
| std::vector<ChromeTraceEvent> m_content; | std::vector<ChromeTraceEvent> m_content; | ||||
| std::unordered_map<std::string, std::string> m_metadata; | |||||
| }; | }; | ||||
| void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results){ | |||||
| auto pid = getpid(); | |||||
| ProfileDataCollector collector; | |||||
| ProfileState state; | |||||
| #define HANDLE_EVENT(type, ...) \ | |||||
| collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||||
| struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||||
| ChromeTraceEvents trace_events; | ChromeTraceEvents trace_events; | ||||
| decltype(getpid()) pid = getpid(); | |||||
| std::string pid_str = std::to_string(pid); | |||||
| #define NEW_HOST(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(state[tid].index).ph(PH).ts((double)time/1e3) | |||||
| #define NEW_DEVICE(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(256+state[event.event->comp_node()].index).ph(PH).ts((double)get_device_time(event.event, time)/1e3) | |||||
| #define OP_NAME op_state.name | |||||
| #define OP_KERNEL_NAME (op_state.name + "") | |||||
| #define OP_PROPS get_op_args(op_state) | |||||
| #define OP_ID event.op_id | |||||
| #define TENSOR_PROPS get_tensor_args(tensor_state, time) | |||||
| #define TENSOR_INFO get_tensor_info(tensor_state, time) | |||||
| #define TENSOR_COMMAND_KIND print_tensor_command_kind(event.kind) | |||||
| #define HANDLE_PLAIN_EVENT(START, FINISH, NAME_EXPR)\ | |||||
| HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||||
| HANDLE_EVENT(FINISH, { NEW_HOST(NAME_EXPR, 'E'); }) | |||||
| #define HANDLE_TENSOR_EVENT(START, FINISH, NAME_EXPR)\ | |||||
| HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||||
| HANDLE_EVENT(FINISH, { auto& tensor_state = state.tensors[event.tensor_id]; NEW_HOST(NAME_EXPR, 'E').args(TENSOR_PROPS); }) | |||||
| #define INC_COUNTER(NAME, DELTA)\ | |||||
| { state.statics.NAME += DELTA; NEW_HOST(#NAME, 'C').arg(#NAME, state.statics.NAME); } | |||||
| auto get_tensor_args = [](const ProfileTensorState& tensor, uint64_t time) -> std::shared_ptr<json::Object> { | |||||
| auto args = json::Object::make(); | |||||
| (*args)["id"] = json::Number::make(tensor.id); | |||||
| (*args)["name"] = json::String::make(tensor.name); | |||||
| (*args)["shape"] = json::String::make(tensor.layout.TensorShape::to_string()); | |||||
| (*args)["dtype"] = json::String::make(tensor.layout.dtype.name()); | |||||
| (*args)["nr_elements"] = json::Number::make(tensor.layout.total_nr_elems()); | |||||
| (*args)["device"] = json::String::make(tensor.device.to_string()); | |||||
| if (tensor.produced) { | |||||
| (*args)["living_time"] = json::String::make(std::to_string((time - tensor.produced + tensor.living_time)/1e6) + "ms"); | |||||
| } | |||||
| return args; | |||||
| }; | |||||
| auto get_tensor_info = [](const ProfileTensorState& tensor, uint64_t time) -> std::string { | |||||
| std::string name = tensor.name; | |||||
| std::string shape = tensor.layout.TensorShape::to_string(); | |||||
| std::string size_in_bytes = std::to_string(tensor.size_in_bytes()); | |||||
| std::string device = tensor.device.to_string(); | |||||
| std::string dtype = tensor.layout.dtype.name(); | |||||
| return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.c_str()); | |||||
| }; | |||||
| ChromeTraceEvent& new_event(std::string name, char ph, size_t tid, profiler::HostTime time) { | |||||
| return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(since_start(time)); | |||||
| } | |||||
| auto get_op_args = [&](const ProfileOperatorState& op) -> std::shared_ptr<json::Object> { | |||||
| auto args = json::Object::make(); | |||||
| auto params = op.params; | |||||
| for (auto&& [name, value]: params) { | |||||
| (*args)[name] = json::String::make(value); | |||||
| } | |||||
| (*args)["__id__"] = json::Number::make(op.id); | |||||
| (*args)["__name__"] = json::String::make(op.name); | |||||
| (*args)["__device__"] = json::String::make(op.device.to_string()); | |||||
| return args; | |||||
| ChromeTraceEvent& new_host_event(std::string name, char ph) { | |||||
| return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); | |||||
| }; | }; | ||||
| auto get_device_time = [&](const std::shared_ptr<CompNode::Event>& event, uint64_t host) -> uint64_t { | |||||
| event->host_wait(); | |||||
| auto& device_state = state.devices[event->comp_node()]; | |||||
| if (!device_state.base_event) { | |||||
| device_state.base_event = event; | |||||
| device_state.base_time = host; | |||||
| return host; | |||||
| } | |||||
| uint64_t device = device_state.base_event->elapsed_time_until(*event) * 1e9 + device_state.base_time; | |||||
| return std::max(device, host); | |||||
| ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | |||||
| using namespace std::literals::chrono_literals; | |||||
| auto time = since_start(to_device_time(current->time, device)); | |||||
| return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); | |||||
| }; | }; | ||||
| auto print_tensor_command_kind = [&](int kind) -> const char* { | |||||
| const char* to_cstr(TensorCommandKind kind) { | |||||
| switch(kind) { | switch(kind) { | ||||
| case TensorCommandEvent::Put: | |||||
| case TensorCommandKind::Put: | |||||
| return "Put"; | return "Put"; | ||||
| case TensorCommandEvent::Drop: | |||||
| case TensorCommandKind::Drop: | |||||
| return "Drop"; | return "Drop"; | ||||
| case TensorCommandEvent::Del: | |||||
| case TensorCommandKind::Del: | |||||
| return "Del"; | return "Del"; | ||||
| case TensorCommandEvent::SwapIn: | |||||
| case TensorCommandKind::SwapIn: | |||||
| return "SwapIn"; | return "SwapIn"; | ||||
| case TensorCommandEvent::SwapOut: | |||||
| case TensorCommandKind::SwapOut: | |||||
| return "SwapOut"; | return "SwapOut"; | ||||
| case TensorCommandEvent::RecFree: | |||||
| case TensorCommandKind::RecFree: | |||||
| return "RecFree"; | return "RecFree"; | ||||
| case TensorCommandEvent::ReGen: | |||||
| case TensorCommandKind::ReGen: | |||||
| return "ReGen"; | return "ReGen"; | ||||
| case TensorCommandKind::GetValue: | |||||
| return "GetValue"; | |||||
| } | } | ||||
| return "UnknownCommand"; | return "UnknownCommand"; | ||||
| }; | |||||
| HANDLE_EVENT(OpDispatchEvent, { | |||||
| auto& op_state = state.operators[OP_ID] = {}; | |||||
| op_state.id = OP_ID; | |||||
| op_state.name = event.op_name; | |||||
| op_state.params = event.op_params(); | |||||
| op_state.inputs = event.inputs; | |||||
| op_state.outputs = event.outputs; | |||||
| NEW_HOST("OpDispatch", 'B'); | |||||
| NEW_HOST(ssprintf("%d", pid), 's') | |||||
| .cat("OpDispatch") | |||||
| .id(OP_ID) | |||||
| .scope(std::to_string(pid)); | |||||
| NEW_HOST("OpDispatch", 'E').args(OP_PROPS); | |||||
| INC_COUNTER(op_enqueue_count, 1); | |||||
| }); | |||||
| HANDLE_EVENT(OpExecuteEvent, { | |||||
| mgb_assert(OP_ID != 0); | |||||
| mgb_assert(state.operators.count(OP_ID) > 0); | |||||
| auto& op_state = state.operators[OP_ID]; | |||||
| op_state.host_begin = time; | |||||
| NEW_HOST(OP_NAME, 'B'); | |||||
| //.args(OP_PROPS); | |||||
| NEW_HOST(ssprintf("%d", pid), 't') | |||||
| .cat("OpDispatch") | |||||
| .id(OP_ID) | |||||
| .scope(std::to_string(pid)); | |||||
| INC_COUNTER(op_execute_count, 1); | |||||
| }); | |||||
| HANDLE_EVENT(OpExecuteFinishEvent, { | |||||
| auto& op_state = state.operators[event.op_id]; | |||||
| op_state.host_end = time; | |||||
| NEW_HOST(OP_NAME, 'E') | |||||
| .args(OP_PROPS); | |||||
| }); | |||||
| HANDLE_EVENT(KernelExecuteEvent, { | |||||
| auto& op_state = state.operators[event.op_id]; | |||||
| op_state.device_begin = event.event; | |||||
| NEW_HOST(ssprintf("%d", pid), 's') | |||||
| .id(event.kernel_id) | |||||
| .cat("KernelLaunch") | |||||
| .scope(std::to_string(pid)); | |||||
| NEW_DEVICE(OP_KERNEL_NAME, 'B') | |||||
| .cat("Kernel"); | |||||
| //.args(OP_PROPS); | |||||
| NEW_DEVICE(ssprintf("%d", pid), 'f') | |||||
| .id(event.kernel_id) | |||||
| .bp('e') | |||||
| .cat("KernelLaunch") | |||||
| .scope(std::to_string(pid)); | |||||
| }); | |||||
| HANDLE_EVENT(KernelExecuteFinishEvent, { | |||||
| auto& op_state = state.operators[event.op_id]; | |||||
| op_state.device_end = event.event; | |||||
| NEW_DEVICE(OP_KERNEL_NAME, 'E') | |||||
| .cat("Kernel") | |||||
| .args(OP_PROPS); | |||||
| }); | |||||
| HANDLE_EVENT(TensorDeclareEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||||
| tensor_state.id = event.tensor_id; | |||||
| tensor_state.name = event.name; | |||||
| }); | |||||
| HANDLE_EVENT(TensorProduceEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| tensor_state.device = event.device; | |||||
| tensor_state.layout = event.layout; | |||||
| tensor_state.produced = time; | |||||
| if (!tensor_state.living_time) { | |||||
| NEW_HOST(ssprintf("%d", pid), 's') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(std::to_string(pid)); | |||||
| } else { | |||||
| NEW_HOST(ssprintf("%d", pid), 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(std::to_string(pid)); | |||||
| } | |||||
| INC_COUNTER(alive_tensor_count, 1); | |||||
| INC_COUNTER(produce_tensor_count, 1); | |||||
| state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
| state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||||
| }); | |||||
| HANDLE_EVENT(TensorUsageEvent, { | |||||
| NEW_HOST(ssprintf("%d", pid), 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(std::to_string(pid)); | |||||
| }); | |||||
| HANDLE_EVENT(TensorReleaseEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| tensor_state.living_time += time - tensor_state.produced; | |||||
| tensor_state.produced = 0; | |||||
| INC_COUNTER(alive_tensor_count, -1); | |||||
| INC_COUNTER(erase_tensor_count, 1); | |||||
| state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
| state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||||
| NEW_HOST(ssprintf("%d", pid), 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(std::to_string(pid)); | |||||
| }); | |||||
| HANDLE_EVENT(TensorEraseEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| if (tensor_state.living_time) { | |||||
| NEW_HOST(ssprintf("%d", pid), 'f') | |||||
| .id(event.tensor_id) | |||||
| .bp('e') | |||||
| .cat("TensorLink") | |||||
| .scope(std::to_string(pid)); | |||||
| } | |||||
| if (event.use_count == 0) { | |||||
| INC_COUNTER(redundant_tensor_count, 1); | |||||
| } | |||||
| }); | |||||
| HANDLE_EVENT(TensorGetPropEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| NEW_HOST("TensorGetProp", 'X') | |||||
| .dur(0).args(TENSOR_PROPS); | |||||
| }); | |||||
| HANDLE_EVENT(TensorWaitPropEvent, { | |||||
| NEW_HOST("TensorWaitProp", 'B'); | |||||
| if (event.prop == TensorProp::HostValue) { | |||||
| INC_COUNTER(wait_value_count, 1); | |||||
| } else if (event.prop == TensorProp::Shape) { | |||||
| INC_COUNTER(wait_shape_count, 1); | |||||
| } | |||||
| INC_COUNTER(wait_prop_count, 1); | |||||
| }); | |||||
| HANDLE_EVENT(TensorWaitPropFinishEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| if (event.notified) { | |||||
| NEW_HOST(ssprintf("%d", pid), 'f') | |||||
| .id(event.tensor_id) | |||||
| .bp('e') | |||||
| .cat("TensorProp") | |||||
| .scope(std::to_string(pid)); | |||||
| } | |||||
| NEW_HOST("TensorWaitProp", 'E') | |||||
| .args(TENSOR_PROPS); | |||||
| }); | |||||
| HANDLE_EVENT(TensorNotifyPropEvent, { | |||||
| NEW_HOST(ssprintf("%d", pid), 's') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorProp") | |||||
| .scope(std::to_string(pid)); | |||||
| }); | |||||
| HANDLE_EVENT(ShapeInferEvent, { | |||||
| if (event.success) { | |||||
| INC_COUNTER(infer_shape_valid_count, 1); | |||||
| } else { | |||||
| INC_COUNTER(infer_shape_invalid_count, 1); | |||||
| } | |||||
| }); | |||||
| HANDLE_EVENT(SampleDeviceEvent, { | |||||
| NEW_HOST("TopKTensor", 'B'); | |||||
| }); | |||||
| HANDLE_EVENT(SampleDeviceFinishEvent, { | |||||
| std::string device_name = event.device.locator().to_string(); | |||||
| std::string prop_name = ssprintf("%s_alloc_memory", device_name.c_str()); | |||||
| NEW_HOST(prop_name, 'C') | |||||
| .arg(prop_name, event.total_memory - event.free_memory); | |||||
| auto top_k_tensors = state.top_k_tensor_in_device(event.device, options.at("num_tensor_watch")); | |||||
| auto& top_k_event = NEW_HOST("TopKTensor", 'E'); | |||||
| for (size_t i = 0; i < top_k_tensors.size(); ++i) { | |||||
| auto tensor_id = top_k_tensors[i]; | |||||
| auto& tensor_state = state.tensors[tensor_id]; | |||||
| top_k_event.arg(ssprintf("top%03d", (int)i), TENSOR_INFO); //%03d is always enough | |||||
| } | |||||
| }); | |||||
| HANDLE_EVENT(WorkerExceptionEvent, { | |||||
| INC_COUNTER(exception_count, 1); | |||||
| }); | |||||
| HANDLE_EVENT(TensorCommandEvent, { | |||||
| NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'B'); | |||||
| }); | |||||
| HANDLE_EVENT(TensorCommandFinishEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'E') | |||||
| .args(TENSOR_PROPS); | |||||
| }); | |||||
| HANDLE_EVENT(ScopeEvent, { | |||||
| NEW_HOST(event.name, 'B'); | |||||
| state.threads[tid].scope_stack.push_back(event.name); | |||||
| }); | |||||
| HANDLE_EVENT(ScopeFinishEvent, { | |||||
| NEW_HOST(event.name, 'E'); | |||||
| mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||||
| state.threads[tid].scope_stack.pop_back(); | |||||
| }); | |||||
| HANDLE_TENSOR_EVENT(OpInputEvent, OpInputFinishEvent, ssprintf("Input %zu", event.tensor_id)); | |||||
| HANDLE_TENSOR_EVENT(OpOutputEvent, OpOutputFinishEvent, ssprintf("Output %zu", event.tensor_id)); | |||||
| HANDLE_TENSOR_EVENT(OpDelEvent, OpDelFinishEvent, ssprintf("Del %zu", event.tensor_id)); | |||||
| HANDLE_PLAIN_EVENT(StartProfileEvent, StartProfileFinishEvent, "StartProfile"); | |||||
| HANDLE_PLAIN_EVENT(StopProfileEvent, StopProfileFinishEvent, "StopProfile"); | |||||
| HANDLE_PLAIN_EVENT(CustomEvent, CustomFinishEvent, event.title); | |||||
| HANDLE_PLAIN_EVENT(AutoEvictEvent, AutoEvictFinishEvent, "AutoEvict"); | |||||
| if (results.size() > 0) { | |||||
| uint64_t time = results[0].second.time; | |||||
| trace_events.new_event().name("Metadata").ph('I').pid(pid).ts(0).arg("localTime", time/1e3); | |||||
| } | } | ||||
| for (auto&& result: results) { | |||||
| collector(result.second.id, result.first, result.second.time, result.second.data); | |||||
| } | |||||
| for (auto&& [tid, thread]: state.threads) { | |||||
| if (!thread_dict.count(tid)) { | |||||
| continue; | |||||
| template <typename TEvent> | |||||
| void visit_event(const TEvent &event) { | |||||
| if constexpr (std::is_same_v<TEvent, OpDispatchEvent>) { | |||||
| new_host_event("OpDispatch", 'B'); | |||||
| new_host_event(pid_str, 's') | |||||
| .cat("OpDispatch") | |||||
| .id(event.op_id) | |||||
| .scope(pid_str); | |||||
| new_host_event("OpDispatch", 'E').args(current_op->detail()); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) { | |||||
| mgb_assert(event.op_id != 0); | |||||
| current_op->execute_begin = current->time; | |||||
| new_host_event(current_op->name, 'B'); | |||||
| new_host_event(pid_str, 't') | |||||
| .cat("OpDispatch") | |||||
| .id(current_op->id) | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) { | |||||
| current_op->execute_end = current->time; | |||||
| new_host_event(current_op->name, 'E') | |||||
| .args(current_op->detail()); | |||||
| } else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) { | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.kernel_id) | |||||
| .cat("KernelLaunch") | |||||
| .scope(pid_str); | |||||
| new_device_event(current_op->name, 'B', event.device) | |||||
| .cat("Kernel"); | |||||
| new_device_event(pid_str, 'f', event.device) | |||||
| .id(event.kernel_id) | |||||
| .bp('e') | |||||
| .cat("KernelLaunch") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, KernelLaunchFinishEvent>) { | |||||
| new_device_event(current_op->name, 'E', event.device) | |||||
| .cat("Kernel") | |||||
| .args(current_op->detail()); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||||
| if (current_tensor->living_time != profiler::Duration::zero()) { | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(pid_str); | |||||
| } else { | |||||
| new_host_event(pid_str, 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(pid_str); | |||||
| } | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorUsageEvent>) { | |||||
| new_host_event(pid_str, 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||||
| current_tensor->living_time += current->time - current_tensor->produced; | |||||
| current_tensor->produced = {}; | |||||
| new_host_event(pid_str, 't') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorLink") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorEraseEvent>) { | |||||
| if (current_tensor->living_time != profiler::Duration::zero()) { | |||||
| new_host_event(pid_str, 'f') | |||||
| .id(event.tensor_id) | |||||
| .bp('e') | |||||
| .cat("TensorLink") | |||||
| .scope(pid_str); | |||||
| } | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorGetPropEvent>) { | |||||
| new_host_event("TensorGetProp", 'X') | |||||
| .dur(0).args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorWaitPropFinishEvent>) { | |||||
| new_host_event(pid_str, 'f') | |||||
| .id(event.tensor_id) | |||||
| .bp('e') | |||||
| .cat("TensorProp") | |||||
| .scope(pid_str); | |||||
| new_host_event("TensorWaitProp", 'E') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorNotifyPropEvent>) { | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.tensor_id) | |||||
| .cat("TensorProp") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) { | |||||
| std::string device_name = event.device.locator().to_string(); | |||||
| new_host_event("memory", 'C') | |||||
| .arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) { | |||||
| new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) { | |||||
| new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'E') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, ScopeEvent>) { | |||||
| new_host_event(event.name, 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, ScopeFinishEvent>) { | |||||
| new_host_event(event.name, 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpInputEvent>) { | |||||
| new_host_event(ssprintf("Input %zu", event.tensor_id), 'B') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpInputFinishEvent>) { | |||||
| new_host_event(ssprintf("Input %zu", event.tensor_id), 'E') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpOutputEvent>) { | |||||
| new_host_event(ssprintf("Output %zu", event.tensor_id), 'B') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, OpOutputFinishEvent>) { | |||||
| new_host_event(ssprintf("Output %zu", event.tensor_id), 'E') | |||||
| .args(current_tensor->detail(current->time)); | |||||
| } else if constexpr (std::is_same_v<TEvent, StartProfileEvent>) { | |||||
| new_host_event("StartProfile", 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, StartProfileFinishEvent>) { | |||||
| new_host_event("StartProfile", 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, StopProfileEvent>) { | |||||
| new_host_event("StopProfile", 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) { | |||||
| new_host_event("StopProfile", 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CustomEvent>) { | |||||
| new_host_event(event.title, 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CustomFinishEvent>) { | |||||
| new_host_event(event.title, 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, AutoEvictEvent>) { | |||||
| new_host_event("AutoEvict", 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, AutoEvictFinishEvent>) { | |||||
| new_host_event("AutoEvict", 'E'); | |||||
| } | } | ||||
| trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(thread.index).arg("name", thread_dict[tid]); | |||||
| } | } | ||||
| for (auto&& [device, device_state]: state.devices) { | |||||
| trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(256+device_state.index).arg("name", device.to_string()); | |||||
| void notify_counter(std::string key, int64_t old_val, int64_t new_val) { | |||||
| new_host_event(key, 'C').arg("value", new_val); | |||||
| } | } | ||||
| }; | |||||
| trace_events.to_json()->writeto_fpath(filename); | |||||
| void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){ | |||||
| ChromeTimelineEventVisitor visitor; | |||||
| visitor.process_events(result); | |||||
| visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); | |||||
| std::string json_repr = visitor.trace_events.to_string(); | |||||
| mgb::debug::write_to_file(filename.c_str(), json_repr); | |||||
| } | } | ||||
| } | } | ||||
| @@ -12,7 +12,9 @@ | |||||
| #pragma once | #pragma once | ||||
| #include "megbrain/utils/small_vector.h" | #include "megbrain/utils/small_vector.h" | ||||
| #include "megbrain/imperative/profiler.h" | |||||
| #include "../interpreter/stack_manager.h" | |||||
| #include "../op_trait.h" | #include "../op_trait.h" | ||||
| namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
| @@ -52,6 +54,11 @@ struct ToStringTrait<profiler::TensorProp>{ | |||||
| namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
| using Trace = interpreter::intl::StackManager::Trace; | |||||
| struct ProfileOperatorState; | |||||
| struct ProfileTensorState; | |||||
| #define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; | #define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; | ||||
| #define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; | #define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; | ||||
| @@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, { | |||||
| std::function<OpParams()> op_params; | std::function<OpParams()> op_params; | ||||
| SmallVector<uint64_t> inputs; | SmallVector<uint64_t> inputs; | ||||
| SmallVector<uint64_t> outputs; | SmallVector<uint64_t> outputs; | ||||
| Trace trace; | |||||
| }); | }); | ||||
| DEF_DUR_EVENT(OpInput, { | DEF_DUR_EVENT(OpInput, { | ||||
| @@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, { | |||||
| TensorShape shape; | TensorShape shape; | ||||
| }); | }); | ||||
| DEF_DUR_EVENT(OpDel, { | |||||
| uint64_t tensor_id; | |||||
| TensorShape shape; | |||||
| }); | |||||
| DEF_DUR_EVENT(OpOutput, { | DEF_DUR_EVENT(OpOutput, { | ||||
| uint64_t tensor_id; | uint64_t tensor_id; | ||||
| TensorShape shape; | TensorShape shape; | ||||
| @@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, { | |||||
| DEF_DUR_EVENT(OpExecute, { | DEF_DUR_EVENT(OpExecute, { | ||||
| uint64_t op_id; | uint64_t op_id; | ||||
| SmallVector<CompNode> device_list; | |||||
| }); | }); | ||||
| DEF_DUR_EVENT(OpPostExecute, { | |||||
| uint64_t op_id; | |||||
| }); | |||||
| DEF_DUR_EVENT(KernelExecute, { | |||||
| DEF_DUR_EVENT(KernelLaunch, { | |||||
| uint64_t op_id; | uint64_t op_id; | ||||
| uint64_t kernel_id; | uint64_t kernel_id; | ||||
| std::shared_ptr<CompNode::Event> event; | |||||
| CompNode device; | |||||
| }); | }); | ||||
| DEF_EVENT(TensorDeclare, { | DEF_EVENT(TensorDeclare, { | ||||
| @@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, { | |||||
| TensorProp prop; | TensorProp prop; | ||||
| }); | }); | ||||
| DEF_EVENT(TensorWaitProp, { | |||||
| DEF_DUR_EVENT(TensorWaitProp, { | |||||
| uint64_t tensor_id; | uint64_t tensor_id; | ||||
| uint64_t wait_id; | uint64_t wait_id; | ||||
| TensorProp prop; | TensorProp prop; | ||||
| }); | }); | ||||
| DEF_EVENT(TensorWaitPropFinish, { | |||||
| uint64_t tensor_id; | |||||
| uint64_t wait_id; | |||||
| TensorProp prop; | |||||
| bool notified; | |||||
| }); | |||||
| DEF_DUR_EVENT(SampleDevice, { | DEF_DUR_EVENT(SampleDevice, { | ||||
| CompNode device; | CompNode device; | ||||
| size_t total_memory; | size_t total_memory; | ||||
| @@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, { | |||||
| std::string name; | std::string name; | ||||
| }); | }); | ||||
| DEF_DUR_EVENT(DeviceScope, { | |||||
| std::string name; | |||||
| std::shared_ptr<CompNode::Event> event; | |||||
| DEF_DUR_EVENT(Sync, { | |||||
| Trace trace; | |||||
| }); | }); | ||||
| DEF_DUR_EVENT(Sync, {}); | |||||
| DEF_DUR_EVENT(StartProfile, { | DEF_DUR_EVENT(StartProfile, { | ||||
| size_t capture_count; | size_t capture_count; | ||||
| }); | }); | ||||
| @@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, { | |||||
| size_t escape_count; | size_t escape_count; | ||||
| }); | }); | ||||
| enum class TensorCommandKind { | |||||
| Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||||
| }; | |||||
| DEF_DUR_EVENT(TensorCommand, { | DEF_DUR_EVENT(TensorCommand, { | ||||
| enum Kind { | |||||
| Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||||
| }; | |||||
| using Kind = TensorCommandKind; | |||||
| uint64_t tensor_id; | uint64_t tensor_id; | ||||
| Kind kind; | Kind kind; | ||||
| }); | }); | ||||
| @@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, { | |||||
| std::string content; | std::string content; | ||||
| }); | }); | ||||
| DEF_EVENT(RecordDevice, { | |||||
| std::shared_ptr<CompNode::Event> event; | |||||
| }); | |||||
| DEF_DUR_EVENT(HostToDevice, { | |||||
| TensorLayout layout; | |||||
| CompNode device; | |||||
| void* host_ptr; | |||||
| void* device_ptr; | |||||
| }); | |||||
| #undef DEF_EVENT | #undef DEF_EVENT | ||||
| #undef DEF_DUR_EVENT | #undef DEF_DUR_EVENT | ||||
| @@ -15,10 +15,12 @@ | |||||
| #include "megbrain/imperative/profiler.h" | #include "megbrain/imperative/profiler.h" | ||||
| #include "./states.h" | |||||
| namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
| void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||||
| void dump_chrome_timeline(std::string filename, Profiler::bundle_t result); | |||||
| void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||||
| void dump_memory_flow(std::string filename, Profiler::bundle_t result); | |||||
| } | } | ||||
| @@ -89,7 +89,8 @@ struct MemoryChunk { | |||||
| std::array<uintptr_t, 2> address; | std::array<uintptr_t, 2> address; | ||||
| std::string name; | std::string name; | ||||
| TensorLayout layout; | TensorLayout layout; | ||||
| std::array<uint64_t, 2> time; | |||||
| std::array<profiler::Duration, 2> time; | |||||
| std::optional<uint64_t> group; | |||||
| bool empty() const { | bool empty() const { | ||||
| return address[1] - address[0] == 0; | return address[1] - address[0] == 0; | ||||
| @@ -111,9 +112,9 @@ struct MemoryFlow { | |||||
| return {addr_begin, addr_end}; | return {addr_begin, addr_end}; | ||||
| } | } | ||||
| std::pair<uint64_t, uint64_t> time_range() const { | |||||
| auto time_begin = std::numeric_limits<uint64_t>::max(); | |||||
| auto time_end = std::numeric_limits<uint64_t>::min(); | |||||
| std::pair<profiler::Duration, profiler::Duration> time_range() const { | |||||
| auto time_begin = profiler::Duration::max(); | |||||
| auto time_end = profiler::Duration::min(); | |||||
| for(auto&& [id, chunk]: chunks) { | for(auto&& [id, chunk]: chunks) { | ||||
| MGB_MARK_USED_VAR(id); | MGB_MARK_USED_VAR(id); | ||||
| if (chunk.empty()) continue; | if (chunk.empty()) continue; | ||||
| @@ -123,27 +124,6 @@ struct MemoryFlow { | |||||
| return {time_begin, time_end}; | return {time_begin, time_end}; | ||||
| } | } | ||||
| std::shared_ptr<json::Array> to_json() const { | |||||
| auto results = json::Array::make(); | |||||
| for(auto&& [id, chunk]: chunks) { | |||||
| MGB_MARK_USED_VAR(id); | |||||
| if (chunk.empty()) continue; | |||||
| auto address = json::Array::make(); | |||||
| auto time = json::Array::make(); | |||||
| address->add(json::String::make(std::to_string(chunk.address[0]))); | |||||
| address->add(json::String::make(std::to_string(chunk.address[1]))); | |||||
| time->add(json::String::make(std::to_string(chunk.time[0]))); | |||||
| time->add(json::String::make(std::to_string(chunk.time[1]))); | |||||
| results->add(json::Object::make({ | |||||
| {"address", address}, | |||||
| {"name", json::String::make(chunk.name)}, | |||||
| {"layout", json::String::make(chunk.layout.to_string())}, | |||||
| {"time", time} | |||||
| })); | |||||
| } | |||||
| return results; | |||||
| } | |||||
| XMLWriter to_svg() const { | XMLWriter to_svg() const { | ||||
| XMLWriter writer; | XMLWriter writer; | ||||
| auto&& [addr_begin, addr_end] = address_range(); | auto&& [addr_begin, addr_end] = address_range(); | ||||
| @@ -157,13 +137,13 @@ struct MemoryFlow { | |||||
| svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); | svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); | ||||
| double time_scale = 1e5; | double time_scale = 1e5; | ||||
| double addr_scale = 1e6; | double addr_scale = 1e6; | ||||
| svg.attr("width", (time_end-time_begin)/time_scale); | |||||
| svg.attr("width", (time_end-time_begin).count()/time_scale); | |||||
| svg.attr("height", (addr_end-addr_begin)/addr_scale); | svg.attr("height", (addr_end-addr_begin)/addr_scale); | ||||
| { | { | ||||
| auto rect = writer.element("rect"); | auto rect = writer.element("rect"); | ||||
| rect.attr("x", 0); | rect.attr("x", 0); | ||||
| rect.attr("y", 0); | rect.attr("y", 0); | ||||
| rect.attr("width", (time_end-time_begin)/time_scale); | |||||
| rect.attr("width", (time_end-time_begin).count()/time_scale); | |||||
| rect.attr("height", (addr_end-addr_begin)/addr_scale); | rect.attr("height", (addr_end-addr_begin)/addr_scale); | ||||
| rect.attr("fill", std::string{"blue"}); | rect.attr("fill", std::string{"blue"}); | ||||
| } | } | ||||
| @@ -177,7 +157,7 @@ struct MemoryFlow { | |||||
| {1000 * ms, "#888888"}, | {1000 * ms, "#888888"}, | ||||
| {std::numeric_limits<double>::infinity(), "#555555"}, | {std::numeric_limits<double>::infinity(), "#555555"}, | ||||
| }; | }; | ||||
| auto time2str = [](uint64_t ns){ | |||||
| auto time2str = [](profiler::Duration ns){ | |||||
| using pair_t = std::pair<uint64_t, const char*>; | using pair_t = std::pair<uint64_t, const char*>; | ||||
| static pair_t units[] = { | static pair_t units[] = { | ||||
| {1, "ns "}, | {1, "ns "}, | ||||
| @@ -189,9 +169,9 @@ struct MemoryFlow { | |||||
| auto comparator = [](const pair_t& lhs, const pair_t& rhs) { | auto comparator = [](const pair_t& lhs, const pair_t& rhs) { | ||||
| return lhs.first < rhs.first; | return lhs.first < rhs.first; | ||||
| }; | }; | ||||
| while (ns > 0) { | |||||
| auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1; | |||||
| builder += std::to_string(ns / iter->first) + iter->second; | |||||
| while (ns.count() > 0) { | |||||
| auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1; | |||||
| builder += std::to_string(ns.count() / iter->first) + iter->second; | |||||
| ns = ns % iter->first; | ns = ns % iter->first; | ||||
| } | } | ||||
| return builder; | return builder; | ||||
| @@ -218,11 +198,11 @@ struct MemoryFlow { | |||||
| for (auto&& [id, chunk]: chunks) { | for (auto&& [id, chunk]: chunks) { | ||||
| MGB_MARK_USED_VAR(id); | MGB_MARK_USED_VAR(id); | ||||
| if (chunk.empty()) continue; | if (chunk.empty()) continue; | ||||
| double left = (chunk.time[0]-time_begin)/time_scale; | |||||
| double right = (chunk.time[1]-time_begin)/time_scale; | |||||
| double left = (chunk.time[0]-time_begin).count()/time_scale; | |||||
| double right = (chunk.time[1]-time_begin).count()/time_scale; | |||||
| double top = (chunk.address[0]-addr_begin)/addr_scale; | double top = (chunk.address[0]-addr_begin)/addr_scale; | ||||
| double bottom = (chunk.address[1]-addr_begin)/addr_scale; | double bottom = (chunk.address[1]-addr_begin)/addr_scale; | ||||
| double duration = chunk.time[1] - chunk.time[0]; | |||||
| double duration = (chunk.time[1] - chunk.time[0]).count(); | |||||
| { | { | ||||
| auto rect = writer.element("rect"); | auto rect = writer.element("rect"); | ||||
| rect.attr("x", left); | rect.attr("x", left); | ||||
| @@ -241,70 +221,48 @@ struct MemoryFlow { | |||||
| mge_attr("produced", time2str(chunk.time[0])); | mge_attr("produced", time2str(chunk.time[0])); | ||||
| mge_attr("erased", time2str(chunk.time[1])); | mge_attr("erased", time2str(chunk.time[1])); | ||||
| mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); | mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); | ||||
| if (chunk.group) { | |||||
| mge_attr("group", std::to_string(*chunk.group)); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| return writer; | return writer; | ||||
| } | } | ||||
| }; | }; | ||||
| void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) { | |||||
| MemoryFlow flow; | |||||
| ProfileDataCollector collector; | |||||
| ProfileState state; | |||||
| #define HANDLE_EVENT(type, ...) \ | |||||
| collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||||
| HANDLE_EVENT(TensorDeclareEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||||
| tensor_state.id = event.tensor_id; | |||||
| tensor_state.name = event.name; | |||||
| }); | |||||
| struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> { | |||||
| MemoryFlow memory_flow; | |||||
| HANDLE_EVENT(TensorProduceEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| tensor_state.device = event.device; | |||||
| tensor_state.layout = event.layout; | |||||
| tensor_state.produced = time; | |||||
| state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
| state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||||
| auto& chunk = flow.chunks[event.tensor_id]; | |||||
| uintptr_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||||
| auto span = event.layout.span(); | |||||
| auto dtype = event.layout.dtype; | |||||
| // assume dtype is not lowbit | |||||
| if (!address) { | |||||
| chunk.address = {0, 0}; | |||||
| } else { | |||||
| chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||||
| template <typename TEvent> | |||||
| void visit_event(const TEvent &event) { | |||||
| if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||||
| auto& chunk = memory_flow.chunks[event.tensor_id]; | |||||
| uint64_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||||
| auto span = event.layout.span(); | |||||
| auto dtype = event.layout.dtype; | |||||
| // assume dtype is not lowbit | |||||
| if (!address) { | |||||
| chunk.address = {0, 0}; | |||||
| } else { | |||||
| chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||||
| } | |||||
| chunk.layout = event.layout; | |||||
| chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device)); | |||||
| chunk.name = current_tensor->name; | |||||
| chunk.group = current_tensor->source; | |||||
| } else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||||
| auto& chunk = memory_flow.chunks[event.tensor_id]; | |||||
| chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device)); | |||||
| } | } | ||||
| chunk.layout = tensor_state.layout; | |||||
| chunk.time[0] = time; | |||||
| chunk.name = tensor_state.name; | |||||
| }); | |||||
| HANDLE_EVENT(TensorReleaseEvent, { | |||||
| auto& tensor_state = state.tensors[event.tensor_id]; | |||||
| state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
| state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||||
| auto& chunk = flow.chunks[event.tensor_id]; | |||||
| chunk.time[1] = time; | |||||
| }); | |||||
| HANDLE_EVENT(ScopeEvent, { | |||||
| state.threads[tid].scope_stack.push_back(event.name); | |||||
| }); | |||||
| HANDLE_EVENT(ScopeFinishEvent, { | |||||
| mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||||
| state.threads[tid].scope_stack.pop_back(); | |||||
| }); | |||||
| for (auto&& result: results) { | |||||
| collector(result.second.id, result.first, result.second.time, result.second.data); | |||||
| } | } | ||||
| debug::write_to_file(filename.c_str(), flow.to_svg().to_string()); | |||||
| void notify_counter(std::string key, int64_t old_val, int64_t new_val) {} | |||||
| }; | |||||
| void dump_memory_flow(std::string filename, Profiler::bundle_t result) { | |||||
| MemoryFlowVisitor visitor; | |||||
| visitor.process_events(std::move(result)); | |||||
| debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); | |||||
| } | } | ||||
| } | } | ||||
| @@ -3,6 +3,9 @@ | |||||
| #include <set> | #include <set> | ||||
| #include <any> | #include <any> | ||||
| #include <typeindex> | #include <typeindex> | ||||
| #include <sstream> | |||||
| #include "nlohmann/json.hpp" | |||||
| #include "megbrain/tensor.h" | #include "megbrain/tensor.h" | ||||
| @@ -10,24 +13,16 @@ | |||||
| namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
| struct ProfileDeviceState { | |||||
| int64_t index; | |||||
| CompNode device; | |||||
| std::shared_ptr<CompNode::Event> base_event; | |||||
| uint64_t base_time; //in ns | |||||
| }; | |||||
| struct ProfileWorkerState { | |||||
| }; | |||||
| using StackManager = interpreter::intl::StackManager; | |||||
| struct ProfileTensorState { | struct ProfileTensorState { | ||||
| uint64_t id; | |||||
| uint64_t id = 0; | |||||
| std::optional<uint64_t> source; | |||||
| TensorLayout layout; | TensorLayout layout; | ||||
| CompNode device; | CompNode device; | ||||
| std::string name; | std::string name; | ||||
| uint64_t produced = 0; | |||||
| uint64_t living_time = 0; | |||||
| profiler::HostTime produced = profiler::HostTime::min(); | |||||
| profiler::Duration living_time = profiler::Duration::zero(); | |||||
| size_t size_in_bytes() const { | size_t size_in_bytes() const { | ||||
| if (!layout.dtype.valid()) { | if (!layout.dtype.valid()) { | ||||
| @@ -35,41 +30,51 @@ struct ProfileTensorState { | |||||
| } | } | ||||
| return layout.dtype.size(layout.total_nr_elems()); | return layout.dtype.size(layout.total_nr_elems()); | ||||
| } | } | ||||
| }; | |||||
| struct ProfileStaticsState { | |||||
| size_t op_enqueue_count = 0; | |||||
| size_t op_execute_count = 0; | |||||
| size_t wait_value_count = 0; | |||||
| size_t wait_shape_count = 0; | |||||
| size_t exception_count = 0; | |||||
| size_t infer_shape_valid_count = 0; | |||||
| size_t infer_shape_invalid_count = 0; | |||||
| size_t alive_tensor_count = 0; | |||||
| size_t produce_tensor_count = 0; | |||||
| size_t erase_tensor_count = 0; | |||||
| size_t wait_prop_count = 0; | |||||
| size_t redundant_tensor_count = 0; | |||||
| std::string info(HostTime current_time) { | |||||
| std::string shape = layout.TensorShape::to_string(); | |||||
| std::string dtype = layout.dtype.name(); | |||||
| return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str()); | |||||
| } | |||||
| nlohmann::json detail(HostTime current_time) { | |||||
| nlohmann::json args; | |||||
| args["id"] = id; | |||||
| args["name"] = name; | |||||
| args["shape"] = layout.TensorShape::to_string(); | |||||
| args["dtype"] = layout.dtype.name(); | |||||
| args["nr_elements"] = layout.total_nr_elems(); | |||||
| args["device"] = device.to_string(); | |||||
| if (produced != produced.min()) { | |||||
| double ms_count = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(current_time - produced + living_time).count(); | |||||
| args["living_time"] = ssprintf("%lf ms", ms_count); | |||||
| } | |||||
| return args; | |||||
| } | |||||
| }; | }; | ||||
| struct ProfileOperatorState { | struct ProfileOperatorState { | ||||
| uint64_t id; | |||||
| uint64_t id = 0; | |||||
| std::string name; | std::string name; | ||||
| OpParams params; | OpParams params; | ||||
| SmallVector<uint64_t> inputs; | SmallVector<uint64_t> inputs; | ||||
| SmallVector<uint64_t> outputs; | SmallVector<uint64_t> outputs; | ||||
| CompNode device; | CompNode device; | ||||
| Trace trace; | |||||
| uint64_t host_begin; | |||||
| uint64_t host_end; | |||||
| std::shared_ptr<CompNode::Event> device_begin; | |||||
| std::shared_ptr<CompNode::Event> device_end; | |||||
| }; | |||||
| profiler::HostTime execute_begin; | |||||
| profiler::HostTime execute_end; | |||||
| struct ProfileThreadState { | |||||
| std::thread::id tid; | |||||
| int64_t index; | |||||
| std::vector<std::string> scope_stack; | |||||
| nlohmann::json detail() { | |||||
| nlohmann::json args; | |||||
| for (auto&& [name, value]: params) { | |||||
| args[name] = value; | |||||
| } | |||||
| args["__id__"] = id; | |||||
| args["__name__"] = name; | |||||
| args["__device__"] = device.to_string(); | |||||
| return args; | |||||
| } | |||||
| }; | }; | ||||
| template <typename TProp> | template <typename TProp> | ||||
| @@ -93,37 +98,12 @@ struct ProfileTensorPropPair { | |||||
| using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; | using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; | ||||
| using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; | using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; | ||||
| struct GeneralTensorEvent { | |||||
| uint64_t tensor_id; | |||||
| std::type_index type; | |||||
| }; | |||||
| struct ProfileState { | struct ProfileState { | ||||
| std::unordered_map<uint64_t, ProfileTensorState> tensors; | std::unordered_map<uint64_t, ProfileTensorState> tensors; | ||||
| std::unordered_map<uint64_t, ProfileOperatorState> operators; | std::unordered_map<uint64_t, ProfileOperatorState> operators; | ||||
| std::unordered_map<std::string, uint64_t> tensor_name_counter; | std::unordered_map<std::string, uint64_t> tensor_name_counter; | ||||
| std::set<ProfileTensorSizePair> tensors_by_size; | std::set<ProfileTensorSizePair> tensors_by_size; | ||||
| std::set<ProfileTensorSizePair> tensors_by_produced; | std::set<ProfileTensorSizePair> tensors_by_produced; | ||||
| ProfileWorkerState worker; | |||||
| ProfileStaticsState statics; | |||||
| std::unordered_map<std::thread::id, ProfileThreadState> threads; | |||||
| CompNode::UnorderedMap<ProfileDeviceState> devices; | |||||
| ProfileThreadState& operator[](std::thread::id tid) { | |||||
| if (threads.count(tid) == 0) { | |||||
| threads[tid].tid = tid; | |||||
| threads[tid].index = threads.size(); | |||||
| } | |||||
| return threads[tid]; | |||||
| } | |||||
| ProfileDeviceState& operator[](CompNode device) { | |||||
| if (devices.count(device) == 0) { | |||||
| devices[device].device = device; | |||||
| devices[device].index = devices.size(); | |||||
| } | |||||
| return devices[device]; | |||||
| } | |||||
| std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { | std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { | ||||
| std::vector<uint64_t> results; | std::vector<uint64_t> results; | ||||
| @@ -138,19 +118,233 @@ struct ProfileState { | |||||
| } | } | ||||
| return results; | return results; | ||||
| } | } | ||||
| }; | |||||
| std::string concat_scope(std::thread::id tid) { | |||||
| auto& scope_stack = threads[tid].scope_stack; | |||||
| if (scope_stack.empty()) { | |||||
| return {}; | |||||
| } | |||||
| std::string result = scope_stack[0]; | |||||
| for (size_t i = 1; i < scope_stack.size(); ++i) { | |||||
| result += "::"; | |||||
| result += scope_stack[i]; | |||||
| template<typename T, typename = void> | |||||
| struct is_op_event : std::false_type { }; | |||||
| template<typename T> | |||||
| struct is_op_event<T, decltype(std::declval<T>().op_id, void())> : std::true_type { }; | |||||
| template<typename T, typename = void> | |||||
| struct is_tensor_event : std::false_type { }; | |||||
| template<typename T> | |||||
| struct is_tensor_event<T, decltype(std::declval<T>().tensor_id, void())> : std::true_type { }; | |||||
| template<typename T, typename = void> | |||||
| struct is_trace_event : std::false_type { }; | |||||
| template<typename T> | |||||
| struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_type { }; | |||||
| template <typename... TItems> | |||||
| class AnyToVariantConverter { | |||||
| public: | |||||
| using any_t = std::any; | |||||
| using variant_t = std::variant<TItems...>; | |||||
| private: | |||||
| std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table; | |||||
| template <typename TItem> | |||||
| void register_converter() { | |||||
| m_table[typeid(TItem)] = [](any_t input) { | |||||
| return variant_t(std::any_cast<TItem>(std::move(input))); | |||||
| }; | |||||
| } | |||||
| public: | |||||
| AnyToVariantConverter() { | |||||
| (register_converter<TItems>(), ...); | |||||
| } | |||||
| variant_t operator()(any_t input) { | |||||
| return m_table[input.type()](std::move(input)); | |||||
| } | |||||
| }; | |||||
| template <typename TSelf> | |||||
| class EventVisitor { | |||||
| private: | |||||
| std::unordered_map<size_t, ProfileOperatorState> m_operators; | |||||
| std::unordered_map<size_t, ProfileTensorState> m_tensors; | |||||
| std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack; | |||||
| HostTime m_start_time; | |||||
| CompNode::UnorderedMap<size_t> m_device_tid_table; | |||||
| std::unordered_map<std::thread::id, size_t> m_host_tid_table; | |||||
| CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> m_device_timeline; | |||||
| std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | |||||
| std::unordered_map<std::string, int64_t> m_counter_table; | |||||
| protected: | |||||
| Profiler::Record* current; | |||||
| ProfileOperatorState* current_op; | |||||
| ProfileTensorState* current_tensor; | |||||
| protected: | |||||
| profiler::Duration since_start(profiler::HostTime time) { | |||||
| return time - m_start_time; | |||||
| } | |||||
| profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) { | |||||
| auto& device_timeline = m_device_timeline[device]; | |||||
| auto upper = device_timeline.lower_bound(time); | |||||
| if (upper == device_timeline.end()) { | |||||
| if (upper == device_timeline.begin()) { | |||||
| return time; | |||||
| } else { | |||||
| --upper; | |||||
| return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
| } | |||||
| } else if (upper->first == time) { | |||||
| return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
| } else if (upper == device_timeline.begin()) { | |||||
| return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
| } | } | ||||
| auto lower = upper; | |||||
| -- lower; | |||||
| double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count()); | |||||
| mgb_assert(ratio > 0 && ratio < 1, "invalid ratio"); | |||||
| mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device time corr"); | |||||
| auto shift = lower->second + ratio * (upper->second - lower->second); | |||||
| auto result = time + std::chrono::duration_cast<profiler::Duration>(shift); | |||||
| return result; | return result; | ||||
| } | } | ||||
| size_t to_tid(std::thread::id host_tid) { | |||||
| return m_host_tid_table.at(host_tid); | |||||
| } | |||||
| size_t to_tid(CompNode device) { | |||||
| return m_device_tid_table.at(device); | |||||
| } | |||||
| void inc_counter(const char* key, int64_t delta) { | |||||
| if (!m_counter_table.count(key)) { | |||||
| m_counter_table[key] = 0; | |||||
| } | |||||
| auto& value = m_counter_table[key]; | |||||
| static_cast<TSelf&>(*this).notify_counter(key, value, value + delta); | |||||
| value += delta; | |||||
| } | |||||
| public: | |||||
| void process_events(Profiler::bundle_t bundle) { | |||||
| m_start_time = bundle.start_at; | |||||
| auto& self = static_cast<TSelf&>(*this); | |||||
| AnyToVariantConverter<OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent, | |||||
| KernelLaunchEvent, KernelLaunchFinishEvent, | |||||
| OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, | |||||
| TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, | |||||
| TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, | |||||
| SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, | |||||
| StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, | |||||
| TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, | |||||
| CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter; | |||||
| auto for_each_entry = [&](auto&& handler) { | |||||
| for (auto& entry: bundle.entries) { | |||||
| current = &entry; | |||||
| std::visit(handler, converter(entry.data)); | |||||
| } | |||||
| current = nullptr; | |||||
| }; | |||||
| // build device timeline | |||||
| struct DeviceStartPair { | |||||
| profiler::HostTime host; | |||||
| std::shared_ptr<CompNode::Event> device; | |||||
| }; | |||||
| CompNode::UnorderedMap<DeviceStartPair> device_start_table; | |||||
| for_each_entry([&](auto&& event){ | |||||
| using T = std::decay_t<decltype(event)>; | |||||
| if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | |||||
| using namespace std::chrono_literals; | |||||
| DeviceStartPair& device_start = device_start_table[event.event->comp_node()]; | |||||
| if (!device_start.device) { | |||||
| device_start = { current->time, event.event }; | |||||
| } | |||||
| event.event->host_wait(); | |||||
| auto device_time = (device_start.host - current->time) + std::chrono::duration_cast<profiler::RealDuration>(device_start.device->elapsed_time_until(*event.event) * 1s); | |||||
| m_device_timeline[event.event->comp_node()][current->time] = device_time; | |||||
| } | |||||
| }); | |||||
| // register host threads | |||||
| for_each_entry([&](auto&& event){ | |||||
| if (!m_host_tid_table.count(current->tid)) { | |||||
| m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
| } | |||||
| }); | |||||
| for_each_entry([&](auto&& event){ | |||||
| using T = std::decay_t<decltype(event)>; | |||||
| if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||||
| auto& op = m_operators[event.op_id]; | |||||
| mgb_assert(op.id == 0, "duplicate operator id"); | |||||
| op.id = event.op_id; | |||||
| op.name = event.op_name; | |||||
| op.params = event.op_params(); | |||||
| op.inputs = event.inputs; | |||||
| op.outputs = event.outputs; | |||||
| op.trace = event.trace; | |||||
| for (auto&& output: event.outputs) { | |||||
| m_tensors.at(output).source = op.id; | |||||
| } | |||||
| } else if constexpr (std::is_same_v<T, TensorDeclareEvent>) { | |||||
| auto& tensor = m_tensors[event.tensor_id]; | |||||
| mgb_assert(tensor.id == 0, "duplicated tensor id"); | |||||
| tensor.id = event.tensor_id; | |||||
| tensor.name = event.name; | |||||
| } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||||
| auto& tensor = m_tensors.at(event.tensor_id); | |||||
| if (!m_device_tid_table.count(event.device)) { | |||||
| m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
| } | |||||
| tensor.device = event.device; | |||||
| } | |||||
| }); | |||||
| // replay execution | |||||
| using namespace std::placeholders; | |||||
| for_each_entry([&](auto&& event){ | |||||
| using T = std::decay_t<decltype(event)>; | |||||
| // update current_op/tensor | |||||
| if constexpr (is_op_event<T>::value) { | |||||
| current_op = &m_operators.at(event.op_id); | |||||
| } else if constexpr (is_tensor_event<T>::value) { | |||||
| current_tensor = &m_tensors.at(event.tensor_id); | |||||
| } | |||||
| if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||||
| current_op->execute_begin = current->time; | |||||
| } else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) { | |||||
| current_op->execute_end = current->time; | |||||
| } | |||||
| // update counters | |||||
| if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||||
| inc_counter("nr_op_pending", 1); | |||||
| } else if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||||
| inc_counter("nr_op_pending", -1); | |||||
| } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||||
| inc_counter("nr_alive_tensor", 1); | |||||
| } else if constexpr (std::is_same_v<T, TensorReleaseEvent>) { | |||||
| inc_counter("nr_alive_tensor", -1); | |||||
| } else if constexpr (std::is_same_v<T, TensorEraseEvent>) { | |||||
| if (event.use_count == 0) { | |||||
| inc_counter("nr_redunant_tensor", 1); | |||||
| } | |||||
| } else if constexpr (std::is_same_v<T, ShapeInferEvent>) { | |||||
| if (!event.success) { | |||||
| inc_counter("nr_shape_infer_failure", 1); | |||||
| } | |||||
| } else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) { | |||||
| inc_counter("nr_exception", 1); | |||||
| } | |||||
| // visit_event_impl | |||||
| self.visit_event(event); | |||||
| // reset current_op/tensor | |||||
| if constexpr (is_op_event<T>::value) { | |||||
| current_op = nullptr; | |||||
| } else if constexpr (is_tensor_event<T>::value) { | |||||
| current_tensor = nullptr; | |||||
| } | |||||
| }); | |||||
| } | |||||
| }; | }; | ||||
| } | } | ||||
| @@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||||
| auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { | auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { | ||||
| // reset | // reset | ||||
| mgb_assert(!event.graph->options().imperative_proxy_graph); | mgb_assert(!event.graph->options().imperative_proxy_graph); | ||||
| CompNode::foreach([](CompNode device){ | |||||
| Profiler::record<RecordDeviceEvent>(Timer::record_device(device)); | |||||
| }); | |||||
| if (m_opr_dict.empty() && m_var_dict.empty()) { | if (m_opr_dict.empty() && m_var_dict.empty()) { | ||||
| init_seq(event.exec); | init_seq(event.exec); | ||||
| } | } | ||||
| @@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||||
| }; | }; | ||||
| auto on_before_kern = [this](BeforeKernel const& event) { | auto on_before_kern = [this](BeforeKernel const& event) { | ||||
| OperatorNodeBase* opr = event.opr; | OperatorNodeBase* opr = event.opr; | ||||
| Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||||
| Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||||
| Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||||
| }; | }; | ||||
| auto on_after_kern = [this](AfterKernel const& event) { | auto on_after_kern = [this](AfterKernel const& event) { | ||||
| OperatorNodeBase* opr = event.opr; | OperatorNodeBase* opr = event.opr; | ||||
| Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||||
| Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||||
| Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||||
| }; | }; | ||||
| auto on_graph_compile = [this](const CompSeqOrderDetermined&) { | auto on_graph_compile = [this](const CompSeqOrderDetermined&) { | ||||
| m_opr_dict.clear(); | m_opr_dict.clear(); | ||||
| @@ -32,15 +32,22 @@ | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace imperative { | namespace imperative { | ||||
| namespace profiler { | |||||
| using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>; | |||||
| using Duration = std::chrono::nanoseconds; | |||||
| using RealDuration = std::chrono::duration<double, std::nano>; | |||||
| using Time = HostTime; | |||||
| } // namespace profiler | |||||
| class Timer { | class Timer { | ||||
| public: | public: | ||||
| void reset(); | |||||
| uint64_t get_nsecs(); | |||||
| uint64_t get_started_at(); | |||||
| static std::shared_ptr<CompNode::Event> record_event(CompNode device); | |||||
| private: | |||||
| decltype(std::chrono::steady_clock::now()) m_start; | |||||
| uint64_t m_started_at; | |||||
| using Time = profiler::Time; | |||||
| static profiler::Time record_host(); | |||||
| static std::shared_ptr<CompNode::Event> record_device(CompNode device); | |||||
| }; | }; | ||||
| @@ -48,7 +55,8 @@ class Profiler { | |||||
| public: | public: | ||||
| struct Record { | struct Record { | ||||
| uint64_t id; | uint64_t id; | ||||
| uint64_t time; //in ns | |||||
| std::thread::id tid; | |||||
| profiler::Time time; | |||||
| std::any data; | std::any data; | ||||
| }; | }; | ||||
| enum Status: uint8_t { | enum Status: uint8_t { | ||||
| @@ -56,23 +64,32 @@ public: | |||||
| Recording = 1, | Recording = 1, | ||||
| Collecting = 2, | Collecting = 2, | ||||
| }; | }; | ||||
| using ProfileCollector = std::function<void(std::thread::id, Record)>; | |||||
| struct ResultBundle; | |||||
| using ProfileCollector = std::function<void(Record)>; | |||||
| using option_t = uint64_t; | using option_t = uint64_t; | ||||
| using options_t = std::unordered_map<std::string, option_t>; | using options_t = std::unordered_map<std::string, option_t>; | ||||
| using result_t = std::pair<std::thread::id, Record>; | |||||
| using results_t = std::vector<result_t>; | |||||
| using entry_t = Record; | |||||
| using bundle_t = ResultBundle; | |||||
| using thread_dict_t = std::unordered_map<std::thread::id, std::string>; | using thread_dict_t = std::unordered_map<std::thread::id, std::string>; | ||||
| struct ResultBundle { | |||||
| profiler::HostTime start_at; | |||||
| thread_dict_t thread_dict; | |||||
| options_t options; | |||||
| std::vector<entry_t> entries; | |||||
| }; | |||||
| private: | private: | ||||
| std::thread::id m_thread_id; | std::thread::id m_thread_id; | ||||
| std::vector<Record> m_records; | std::vector<Record> m_records; | ||||
| std::vector<std::any> m_duration_stack; | |||||
| std::atomic<Status> m_status = Running; | std::atomic<Status> m_status = Running; | ||||
| uint64_t m_last_time = 0; | |||||
| std::string m_thread_name; | std::string m_thread_name; | ||||
| static options_t sm_profile_options; | static options_t sm_profile_options; | ||||
| static std::mutex sm_mutex; | static std::mutex sm_mutex; | ||||
| static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | ||||
| static Timer sm_timer; | static Timer sm_timer; | ||||
| static profiler::HostTime sm_start_at; | |||||
| static std::atomic_uint64_t sm_last_id; | static std::atomic_uint64_t sm_last_id; | ||||
| static std::atomic_size_t sm_preferred_capacity; | static std::atomic_size_t sm_preferred_capacity; | ||||
| static bool sm_profiling; | static bool sm_profiling; | ||||
| @@ -100,7 +117,7 @@ public: | |||||
| static void reset() { | static void reset() { | ||||
| mgb_assert(sm_profilers.size() == 0, "profiler already running"); | mgb_assert(sm_profilers.size() == 0, "profiler already running"); | ||||
| sm_timer.reset(); | |||||
| sm_start_at = profiler::HostTime::min(); | |||||
| } | } | ||||
| static uint64_t next_id() { | static uint64_t next_id() { | ||||
| @@ -110,16 +127,13 @@ public: | |||||
| template <typename T, typename... TArgs> | template <typename T, typename... TArgs> | ||||
| static uint64_t record(TArgs&&... args) { | static uint64_t record(TArgs&&... args) { | ||||
| auto& profiler = get_instance(); | auto& profiler = get_instance(); | ||||
| auto last_time = profiler.m_last_time; | |||||
| if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
| Status expected = Running; | Status expected = Running; | ||||
| mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | ||||
| } | } | ||||
| uint64_t id = next_id(); | uint64_t id = next_id(); | ||||
| uint64_t time = sm_timer.get_nsecs(); | |||||
| time = std::max(time, last_time + 2000); | |||||
| profiler.m_last_time = time; | |||||
| profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}}); | |||||
| profiler::Time time = sm_timer.record_host(); | |||||
| profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}}); | |||||
| if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
| Status expected = Recording; | Status expected = Recording; | ||||
| mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | ||||
| @@ -127,7 +141,8 @@ public: | |||||
| return id; | return id; | ||||
| } | } | ||||
| static results_t collect() { | |||||
| static bundle_t collect() { | |||||
| bundle_t bundle; | |||||
| MGB_LOCK_GUARD(sm_mutex); | MGB_LOCK_GUARD(sm_mutex); | ||||
| if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
| for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
| @@ -136,17 +151,17 @@ public: | |||||
| mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); | mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); | ||||
| } | } | ||||
| } | } | ||||
| std::vector<std::pair<std::thread::id, Record>> profile_data; | |||||
| std::vector<entry_t> profile_data; | |||||
| for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
| sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | ||||
| for (auto& record: profiler->m_records) { | for (auto& record: profiler->m_records) { | ||||
| profile_data.push_back({tid, std::move(record)}); | |||||
| profile_data.push_back(std::move(record)); | |||||
| } | } | ||||
| profiler->m_records.clear(); | profiler->m_records.clear(); | ||||
| profiler->m_records.reserve(sm_preferred_capacity); | profiler->m_records.reserve(sm_preferred_capacity); | ||||
| } | } | ||||
| std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ | std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ | ||||
| return lhs.second.id < rhs.second.id; | |||||
| return lhs.id < rhs.id; | |||||
| }); | }); | ||||
| if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
| for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
| @@ -155,7 +170,11 @@ public: | |||||
| mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | ||||
| } | } | ||||
| } | } | ||||
| return profile_data; | |||||
| bundle.entries = profile_data; | |||||
| bundle.options = get_options(); | |||||
| bundle.start_at = sm_start_at; | |||||
| bundle.thread_dict = get_thread_dict(); | |||||
| return bundle; | |||||
| } | } | ||||
| static option_t get_option(std::string key, option_t default_val) { | static option_t get_option(std::string key, option_t default_val) { | ||||
| @@ -179,6 +198,7 @@ public: | |||||
| static void start_profile() { | static void start_profile() { | ||||
| mgb_assert(!sm_profiling); | mgb_assert(!sm_profiling); | ||||
| sm_start_at = Timer::record_host(); | |||||
| sm_profiling = true; | sm_profiling = true; | ||||
| } | } | ||||
| @@ -189,7 +209,7 @@ public: | |||||
| static thread_dict_t get_thread_dict(); | static thread_dict_t get_thread_dict(); | ||||
| static void dump_profile(std::string basename, std::string format, results_t results, options_t options); | |||||
| static void dump_profile(std::string basename, std::string format, bundle_t result); | |||||
| }; | }; | ||||
| @@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO | |||||
| target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) | target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) | ||||
| target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) | target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) | ||||
| set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3) | |||||
| set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) | |||||
| if(MGE_WITH_CUDA) | if(MGE_WITH_CUDA) | ||||
| list(APPEND LINK_LIBS cudart) | list(APPEND LINK_LIBS cudart) | ||||