@@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
 void ChannelImpl::regenerate(TensorInfo* dest) {
     if (dest->evict_type == EvictType::DROP) {
         auto&& path = dest->producer;
-        m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
+        m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest, "dtr"});
         if (!m_applying) flush_apply_stack();
     } else if (dest->evict_type == EvictType::SWAP) {
         MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
@@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
     }
 }
 
-void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
+void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
     using namespace ranges;
     using namespace ranges::views;
     auto& state = get_worker_state();
@@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
         }
         return outputs;
     };
-    MGB_RECORD_EVENT(OpExecuteEvent, apply_id);
+    MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
     // Begin profiling operator
     SmallVector<std::pair<CompNode, uint64_t>> kernels;
     if (profiling_device) {
@@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
         }
         m_dtr.unpin(cmd.inputs);
     }
-    MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id);
+    MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
     // End profiling operator
 }
 
@@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() {
     m_applying = true;
     auto& state = get_worker_state();
     while (!m_apply_stack.empty()) {
-        auto& [cmd, idx, recomp] = m_apply_stack.top();  // cmd.inputs[0~idx-1] is in memory
+        auto& [cmd, idx, recomp, reason] = m_apply_stack.top();  // cmd.inputs[0~idx-1] is in memory
         if (idx == 0) {
             if (state.options.enable_dtr_auto_drop) {
                 m_dtr.pin(cmd.inputs);
@@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() {
         }
         if (regen) continue;
         // the required input tensors are already in memory
-        auto cmd_backup = cmd;
-        auto recomp_backup = recomp;
+        auto [cmd_backup, recomp_backup, reason_backup] = std::make_tuple(cmd, recomp, reason);
        m_apply_stack.pop();
-        do_apply_op(cmd_backup);
+        do_apply_op(cmd_backup, reason_backup);
         if (recomp_backup) {
             MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
             for (auto o : cmd_backup.outputs) {
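The copy-before-pop in the hunk above is load-bearing: the structured binding refers to m_apply_stack.top(), which pop() destroys, so the frame data (now including the reason string) is copied out before the stack is popped and do_apply_op() runs. A small self-contained sketch of the same pattern, with hypothetical frame contents, not taken from the patch:

#include <cassert>
#include <stack>
#include <string>
#include <tuple>

int main() {
    // Hypothetical stand-in for m_apply_stack frames: (op name, input idx, reason).
    std::stack<std::tuple<std::string, size_t, std::string>> frames;
    frames.push({"matmul", 0, "dtr"});
    auto& [op, idx, reason] = frames.top();  // references into the top element
    auto [op_copy, idx_copy, reason_copy] = std::make_tuple(op, idx, reason);
    frames.pop();  // 'op', 'idx' and 'reason' now dangle; the copies stay valid
    assert(op_copy == "matmul" && idx_copy == 0 && reason_copy == "dtr");
    return 0;
}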
@@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) {
         sample_on_device(m_dtr.comp_node, false);
         auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num);
         if (!best) {
+            MGB_RECORD_EVENT(AutoEvictFinishEvent);
             break;
         }
         if (best->ptr.unique() && best->ptr->blob().unique()) {
@@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
             set_log_level(pre_level);
             mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected");
             set_log_level(LogLevel::NO_LOG);
+            imperative_log_profile_begin("defrag");
             BlobManager::inst()->defrag(x->comp_node());
+            imperative_log_profile_end("defrag");
             BlobManager::inst()->alloc_direct(x, x->size());
         }
     });
@@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
                     return;
                 }
             }
-            m_apply_stack.push({cmd, 0, nullptr});
+            m_apply_stack.push({cmd, 0, nullptr, "cmd"});
             flush_apply_stack();
             for (size_t i = 0; i < cmd.outputs.size(); ++i) {
                 auto output = cmd.outputs[i];
@@ -104,8 +104,8 @@ private:
     void release_tensor(TensorInfo* dest);
 
     void regenerate(TensorInfo* dest);
-    void do_apply_op(const ApplyOp& cmd);
     void flush_apply_stack();
+    void do_apply_op(const ApplyOp& cmd, std::string reason);
 
     std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace(
             const OpDef& def,
@@ -150,7 +150,8 @@ private:
     std::exception_ptr m_worker_exc;
     std::function<void(std::string, std::string)> m_profile_dump_callback;
     size_t m_storage_id = 0;
-    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*>> m_apply_stack;
+    // TODO: use explicit struct
+    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
     bool m_applying = false;
     bool m_closed = false;
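The TODO above hints at replacing the growing tuple with a named struct. One possible shape, purely illustrative and not part of the patch (field names are invented; the comments reflect how the tuple elements are used in the hunks above):

// Hypothetical replacement for the std::tuple element type of m_apply_stack.
struct ApplyStackFrame {
    ApplyOp cmd;          // operator to (re)apply
    size_t input_idx;     // inputs [0, input_idx) are already in memory
    TensorInfo* recomp;   // non-null when this apply regenerates a dropped tensor
    std::string reason;   // "cmd" for user-dispatched ops, "dtr" for recomputation
};
// std::stack<ApplyStackFrame> m_apply_stack;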
@@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
 
     ChromeTraceEvent& new_host_event(std::string name, char ph) {
         return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time));
-    };
+    }
 
     ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
         using namespace std::literals::chrono_literals;
         auto time = since_start(to_device_time(current->time, device));
         return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time);
-    };
+    }
 
     const char* to_cstr(TensorCommandKind kind) {
         switch(kind) {
@@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
             new_host_event("OpDispatch", 'E').args(current_op->detail());
         } else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
             mgb_assert(event.op_id != 0);
-            current_op->execute_begin = current->time;
             new_host_event(current_op->name, 'B');
             new_host_event(pid_str, 't')
                     .cat("OpDispatch")
                     .id(current_op->id)
                     .scope(pid_str);
         } else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) {
-            current_op->execute_end = current->time;
             new_host_event(current_op->name, 'E')
                     .args(current_op->detail());
         } else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) {
@@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, {
 DEF_DUR_EVENT(OpExecute, {
     uint64_t op_id;
     SmallVector<CompNode> device_list;
+    std::string reason;
 });
 
 DEF_DUR_EVENT(KernelLaunch, {
@@ -62,8 +62,13 @@ struct ProfileOperatorState {
     CompNode device;
     Trace trace;
 
-    profiler::HostTime execute_begin;
-    profiler::HostTime execute_end;
+    struct Execution {
+        std::string reason;
+        profiler::HostTime begin;
+        profiler::HostTime end;
+    };
+
+    SmallVector<Execution> executions;
 
     nlohmann::json detail() {
         nlohmann::json args;
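With per-execution records, an operator that DTR recomputes keeps one Execution entry per run instead of a single begin/end pair. A sketch of how the recompute share could be summed from these records, assuming profiler::HostTime is a std::chrono time point; the helper is illustrative and not part of the patch:

#include <chrono>

// Illustrative only: total time spent re-executing an operator for DTR
// regeneration, summed over its recorded executions.
template <typename OperatorState>
std::chrono::microseconds dtr_recompute_time(const OperatorState& op) {
    std::chrono::microseconds total{0};
    for (auto&& exec : op.executions) {
        if (exec.reason == "dtr") {  // reason is copied from the OpExecuteEvent
            total += std::chrono::duration_cast<std::chrono::microseconds>(
                    exec.end - exec.begin);
        }
    }
    return total;
}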
@@ -285,7 +290,7 @@ public:
             op.outputs = event.outputs;
             op.trace = event.trace;
             for (auto&& output: event.outputs) {
-                m_tensors.at(output).source = op.id;
+                m_tensors[output].source = op.id;
             }
         } else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
             auto& tensor = m_tensors[event.tensor_id];
@@ -293,7 +298,7 @@ public:
             tensor.id = event.tensor_id;
             tensor.name = event.name;
         } else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
-            auto& tensor = m_tensors.at(event.tensor_id);
+            auto& tensor = m_tensors[event.tensor_id];
             if (!m_device_tid_table.count(event.device)) {
                 m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
             }
@@ -308,15 +313,24 @@ public:
         using T = std::decay_t<decltype(event)>;
         // update current_op/tensor
         if constexpr (is_op_event<T>::value) {
-            current_op = &m_operators.at(event.op_id);
+            current_op = &m_operators[event.op_id];
+            if (current_op->id == 0) {
+                current_op->id = event.op_id;
+                current_op->name = "UnknownOperator";
+            }
         } else if constexpr (is_tensor_event<T>::value) {
-            mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found");
-            current_tensor = &m_tensors.at(event.tensor_id);
+            current_tensor = &m_tensors[event.tensor_id];
+            if (current_tensor->id == 0) {
+                current_tensor->id = event.tensor_id;
+                current_tensor->name = "UnknownTensor";
+            }
         }
         if constexpr (std::is_same_v<T, OpExecuteEvent>) {
-            current_op->execute_begin = current->time;
+            current_op->executions.emplace_back();
+            current_op->executions.back().reason = event.reason;
+            current_op->executions.back().begin = current->time;
         } else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
-            current_op->execute_end = current->time;
+            current_op->executions.back().end = current->time;
         }
         // update counters
         if constexpr (std::is_same_v<T, OpDispatchEvent>) {
@@ -337,6 +351,12 @@ public:
             }
         } else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
             inc_counter("nr_exception", 1);
+        } else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
+            auto& execution = current_op->executions.back();
+            if (execution.reason == "dtr") {
+                auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device);
+                inc_counter("dtr_overhead_us", std::chrono::duration_cast<std::chrono::microseconds>(overhead).count());
+            }
         }
         // visit_event_impl
         self.visit_event(event);
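The new dtr_overhead_us counter accumulates a device-time span, converted to microseconds, each time a kernel of a "dtr"-tagged execution finishes. A minimal standalone illustration of that conversion; the timestamps are made up and stand in for the to_device_time() results in the hunk above:

#include <chrono>
#include <cstdio>

int main() {
    using namespace std::chrono;
    // Hypothetical stand-ins for to_device_time(execution.begin, device)
    // and to_device_time(current->time, device).
    auto begin = steady_clock::now();
    auto now = begin + milliseconds(3) + microseconds(250);
    // Same conversion as the inc_counter("dtr_overhead_us", ...) call above.
    auto overhead_us = duration_cast<microseconds>(now - begin).count();
    std::printf("dtr overhead: %lld us\n", static_cast<long long>(overhead_us));  // prints 3250
    return 0;
}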