@@ -1012,9 +1012,10 @@ void init_tensor(py::module m) {
             interpreter_for_py->stop_profile();
             interpreter_for_py->sync();
             imperative::Profiler::stop_profile();
-            auto results = imperative::Profiler::collect();
-            return [results=std::move(results)](std::string basename, std::string format){
-                imperative::Profiler::dump_profile(basename, format, results);
+            auto results = std::make_shared<imperative::Profiler::bundle_t>(imperative::Profiler::collect());
+            return [results=results](std::string basename, std::string format) mutable {
+                imperative::Profiler::dump_profile(basename, format, std::move(*results));
+                results = nullptr;
             };
         }, py::call_guard<py::gil_scoped_release>());
     m.def("sync",
@@ -1133,9 +1133,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
                 }
             }
             CompNode::foreach([&](CompNode device){
-                if (Profiler::get_option("sample_rate", 0)) {
-                    sample_on_device(device, true);
-                }
+                sample_on_device(device, true);
                 MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device));
             });
             MGB_RECORD_EVENT(StartProfileFinishEvent);
@@ -1149,9 +1147,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
                 MGB_RECORD_EVENT(TensorEraseEvent, info->id);
             }
             CompNode::foreach([&](CompNode device){
-                if (Profiler::get_option("sample_rate", 0)) {
-                    sample_on_device(device, true);
-                }
+                sample_on_device(device, true);
             });
             MGB_RECORD_EVENT(StopProfileFinishEvent);
         } else if constexpr (std::is_same_v<T, PushScope>) {
@@ -28,7 +28,7 @@ namespace mgb {
 namespace imperative {

 profiler::Time Timer::record_host() {
-    return std::chrono::high_resolution_clock::now();
+    return std::chrono::system_clock::now();
 }

 std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
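The clock swap above (mirrored in the profiler header further down) trades high_resolution_clock, which common standard libraries alias to steady_clock with an arbitrary epoch, for system_clock, whose time_points are anchored to the wall-clock epoch and can therefore be exported as absolute timestamps and compared with times recorded elsewhere. A small standalone sketch of that property (the `ts` naming follows the chrome://tracing convention and is an assumption here):

    #include <chrono>
    #include <cstdio>

    int main() {
        using namespace std::chrono;
        // system_clock::time_point converts to an absolute count since the
        // clock's epoch (the Unix epoch on mainstream implementations,
        // guaranteed since C++20), which is what a trace file needs.
        auto now = system_clock::now();
        auto ts = duration_cast<microseconds>(now.time_since_epoch()).count();
        std::printf("ts = %lld us since epoch\n", static_cast<long long>(ts));
        // steady_clock / high_resolution_clock offer no such anchoring: their
        // epoch is unspecified (often boot time), so only differences are meaningful.
    }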
@@ -40,12 +40,12 @@ std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
 std::vector<Profiler::entry_t> Profiler::sm_records;
 Profiler::options_t Profiler::sm_profile_options;
 std::mutex Profiler::sm_mutex;
-std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
+std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> Profiler::sm_profilers;
 Timer Profiler::sm_timer;
 profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min();
 std::atomic_uint64_t Profiler::sm_last_id = 0;
 bool Profiler::sm_profiling = false;
-thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
+thread_local Profiler* Profiler::tm_profiler = nullptr;
 std::atomic_size_t Profiler::sm_preferred_capacity;

 auto Profiler::get_thread_dict() -> thread_dict_t {
@@ -65,7 +65,7 @@ void Profiler::dump_profile(std::string basename, std::string format, bundle_t r
     if (iter == format_table.end()) {
         mgb_log_error("unsupported profiling format %s", format.c_str());
     }
-    return (iter->second)(basename, result);
+    return (iter->second)(basename, std::move(result));
 }

 } // namespace imperative
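One observation on the unchanged lines of this hunk (not something the diff touches): when `format` is unknown, `iter` equals `format_table.end()` but is still dereferenced on the next line after the error is logged. A standalone sketch of the guarded dispatch, with placeholder names (`Bundle`, the `"chrome_timeline.json"` key) that are illustrative only:

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct Bundle {};   // stand-in for Profiler::bundle_t

    void dump_profile(const std::string& basename, const std::string& format, Bundle result) {
        static const std::unordered_map<std::string,
                std::function<void(const std::string&, Bundle)>> format_table = {
            {"chrome_timeline.json", [](const std::string& name, Bundle) {
                std::printf("writing chrome timeline to %s\n", name.c_str());
            }},
        };
        auto iter = format_table.find(format);
        if (iter == format_table.end()) {
            std::printf("unsupported profiling format %s\n", format.c_str());
            return;   // bail out before touching an end() iterator
        }
        iter->second(basename, std::move(result));
    }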
@@ -268,7 +268,7 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
                     .cat("Kernel")
                     .args(current_op->detail());
         } else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
-            if (current_tensor->living_time != profiler::Duration::zero()) {
+            if (current_tensor->living_time == profiler::Duration::zero()) {
                 new_host_event(pid_str, 's')
                         .id(event.tensor_id)
                         .cat("TensorLink")
@@ -319,8 +319,8 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
                     .scope(pid_str);
         } else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) {
             std::string device_name = event.device.locator().to_string();
-            new_host_event("memory", 'C')
-                    .arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory);
+            new_host_event(ssprintf("%s_alloc_mem", device_name.c_str()), 'C')
+                    .arg("value", event.total_memory - event.free_memory);
         } else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) {
             new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B');
         } else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) {
@@ -366,6 +366,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
                     .arg("dtype", event.layout.dtype.name())
                     .arg("nr_elements", event.layout.total_nr_elems())
                     .arg("device", event.device.to_string());
+        } else if constexpr (std::is_same_v<TEvent, RecordDeviceEvent>) {
+            auto current_host_time = current->time;
+            auto current_device_time = to_device_time(current->time, event.event->comp_node());
+            auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>(current_device_time - current_host_time);
+            new_host_event("device_ahead_ms", 'C')
+                    .arg("value", device_ahead.count());
         }
     }
@@ -261,7 +261,7 @@ struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> {
 void dump_memory_flow(std::string filename, Profiler::bundle_t result) {
     MemoryFlowVisitor visitor;
-    visitor.process_events(std::move(result));
+    visitor.process_events(result);
     debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string());
 }
@@ -139,22 +139,22 @@ struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_
 template <typename... TItems>
 class AnyToVariantConverter {
 public:
-    using any_t = std::any;
+    using any_t = AnyPtr;
     using variant_t = std::variant<TItems...>;
 private:
-    std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table;
+    std::unordered_map<std::type_index, std::function<variant_t(const any_t&)>> m_table;

     template <typename TItem>
     void register_converter() {
-        m_table[typeid(TItem)] = [](any_t input) {
-            return variant_t(std::any_cast<TItem>(std::move(input)));
+        m_table[typeid(TItem)] = [](const any_t& input) {
+            return variant_t(*input.as<TItem>());
         };
     }
 public:
     AnyToVariantConverter() {
         (register_converter<TItems>(), ...);
     }
-    variant_t operator()(any_t input) {
+    variant_t operator()(const any_t& input) {
         return m_table[input.type()](std::move(input));
     }
 };
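For context, the converter's job is to turn a type-erased event handle back into a std::variant so a visitor can branch on the static type with `if constexpr`. The standalone sketch below shows the same pattern; it deliberately uses std::any as the erased handle for brevity, whereas the version above now works on the pool-backed AnyPtr via `*input.as<TItem>()` (OpStart/OpFinish are illustrative stand-ins, not MegEngine event types):

    #include <any>
    #include <cstdio>
    #include <functional>
    #include <type_traits>
    #include <typeindex>
    #include <unordered_map>
    #include <variant>

    template <typename... TItems>
    class AnyToVariant {
        using variant_t = std::variant<TItems...>;
        std::unordered_map<std::type_index, std::function<variant_t(const std::any&)>> m_table;
        template <typename TItem>
        void register_converter() {
            m_table[typeid(TItem)] = [](const std::any& input) {
                return variant_t(std::any_cast<TItem>(input));
            };
        }
    public:
        AnyToVariant() { (register_converter<TItems>(), ...); }
        variant_t operator()(const std::any& input) { return m_table.at(input.type())(input); }
    };

    struct OpStart { int op_id; };
    struct OpFinish { int op_id; };

    int main() {
        AnyToVariant<OpStart, OpFinish> convert;
        std::any erased = OpFinish{42};
        std::visit([](auto&& ev) {
            using T = std::decay_t<decltype(ev)>;
            if constexpr (std::is_same_v<T, OpStart>)  std::printf("start %d\n", ev.op_id);
            if constexpr (std::is_same_v<T, OpFinish>) std::printf("finish %d\n", ev.op_id);
        }, convert(erased));
    }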
@@ -222,7 +222,7 @@ protected:
         value += delta;
     }
 public:
-    void process_events(Profiler::bundle_t bundle) {
+    void process_events(Profiler::bundle_t& bundle) {
         m_start_time = bundle.start_at;

         auto& self = static_cast<TSelf&>(*this);
@@ -231,7 +231,7 @@ public:
                 OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
                 TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent,
                 TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
-                SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
+                SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
                 StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
                 TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent,
                 CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent,
@@ -298,6 +298,7 @@ public:
                     m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
                 }
                 tensor.device = event.device;
+                tensor.layout = event.layout;
             }
         });
@@ -34,9 +34,10 @@ namespace imperative {
 namespace profiler {

-using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>;
+using HostTime = std::chrono::time_point<std::chrono::system_clock>;

 using Duration = std::chrono::nanoseconds;
+using RealDuration = std::chrono::duration<double, std::nano>;

 using Time = HostTime;
@@ -50,6 +51,52 @@ public:
     static std::shared_ptr<CompNode::Event> record_device(CompNode device);
 };

+class AnyPtr {
+public:
+    struct Deleter {
+        void* object;
+        void (*method)(void*, void*);
+        void operator()(void* ptr) {
+            method(object, ptr);
+        }
+    };
+private:
+    using holder_t = std::unique_ptr<void, Deleter>;
+    const std::type_info* m_type = nullptr;
+    holder_t m_holder = nullptr;
+public:
+    AnyPtr() = default;
+    template <typename T, typename = std::enable_if_t<!std::is_same_v<std::decay_t<T>, AnyPtr>>>
+    explicit AnyPtr(T* value, Deleter deleter) {
+        m_type = &typeid(T);
+        m_holder = {value, deleter};
+    }
+    template <typename T>
+    T* as() {
+        mgb_assert(is_exactly<T>(), "type mismatch");
+        return reinterpret_cast<T*>(m_holder.get());
+    }
+    template <typename T>
+    const T* as() const {
+        mgb_assert(is_exactly<T>(), "type mismatch");
+        return reinterpret_cast<const T*>(m_holder.get());
+    }
+    template <typename T>
+    bool is_exactly() const {
+        return std::type_index{typeid(T)} == std::type_index{*m_type};
+    }
+    const std::type_info& type() const {
+        return *m_type;
+    }
+    bool operator==(std::nullptr_t nptr) const {
+        return m_holder == nullptr;
+    }
+    operator bool() const {
+        return m_holder != nullptr;
+    }
+};
+
 class Profiler {
 public:
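A short usage sketch for the AnyPtr introduced above (assumes the class is in scope; `Payload` is a placeholder type). Compared with std::any it never copies the stored object, and the deleter is a plain {context pointer, function pointer} pair, so a capture-less lambda is enough when no context is needed:

    struct Payload { int value; };   // illustrative only

    void anyptr_demo() {
        AnyPtr ptr{new Payload{7}, {nullptr, [](void*, void* p) {
            delete static_cast<Payload*>(p);      // plain heap deleter, no pool involved
        }}};
        mgb_assert(ptr.is_exactly<Payload>());    // exact type check, no inheritance walk
        mgb_assert(ptr.as<Payload>()->value == 7);
        // when `ptr` goes out of scope the stored deleter runs exactly once
    }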
@@ -57,7 +104,10 @@ public:
         uint64_t id;
         std::thread::id tid;
         profiler::Time time;
-        std::any data;
+        AnyPtr data;
+        Record() = default;
+        Record(uint64_t id, std::thread::id tid, profiler::Time time, AnyPtr data):
+                id{id}, tid{tid}, time{time}, data{std::move(data)} {};
     };
     enum Status: uint8_t {
         Running = 0,
@@ -82,36 +132,52 @@ private:
     std::thread::id m_thread_id;
     std::vector<Record> m_records;
     std::atomic<Status> m_status = Running;
+    std::unordered_map<std::type_index, AnyPtr> m_mem_pools;

     static std::vector<entry_t> sm_records;
     static options_t sm_profile_options;
     static std::mutex sm_mutex;
-    static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
+    // assume std::thread::id is unique
+    static std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> sm_profilers;
     static Timer sm_timer;
     static profiler::HostTime sm_start_at;
     static std::atomic_uint64_t sm_last_id;
     static std::atomic_size_t sm_preferred_capacity;
     static bool sm_profiling;
     static constexpr bool sm_debug = false;
-    thread_local static std::unique_ptr<Profiler> tm_profiler;
+    thread_local static Profiler* tm_profiler;
 public:
-    Profiler() {
-        m_thread_id = std::this_thread::get_id();
-        MGB_LOCK_GUARD(sm_mutex);
-        mgb_assert(sm_profilers.count(m_thread_id) == 0);
-        sm_profilers[m_thread_id] = this;
-    }
-    ~Profiler() {
-        MGB_LOCK_GUARD(sm_mutex);
-        mgb_assert(sm_profilers.count(m_thread_id) == 1);
-        sm_profilers.erase(m_thread_id);
-        sm_records.insert(sm_records.end(), m_records.begin(), m_records.end());
-    }
+    explicit Profiler(std::thread::id tid): m_thread_id{tid} {
+        mgb_assert(tid == std::this_thread::get_id(), "thread id mismatch");
+    }
 public:
     static Profiler& get_instance() {
+        if (!tm_profiler) {
+            MGB_LOCK_GUARD(sm_mutex);
+            auto& profiler = sm_profilers[std::this_thread::get_id()];
+            if (!profiler) {
+                profiler = std::make_unique<Profiler>(std::this_thread::get_id());
+            }
+            tm_profiler = profiler.get();
+        }
         return *tm_profiler;
     }
+    template <typename T>
+    static MemPool<T>& get_mem_pool() {
+        thread_local MemPool<T>* t_pool = nullptr;
+        if (t_pool == nullptr) {
+            auto& pool = get_instance().m_mem_pools[typeid(MemPool<T>)];
+            if (pool == nullptr) {
+                pool = AnyPtr(new MemPool<T>(), {nullptr, [](void*, void* ptr){
+                    delete reinterpret_cast<MemPool<T>*>(ptr);
+                }});
+            }
+            t_pool = pool.as<MemPool<T>>();
+        }
+        return *t_pool;
+    }
     static uint64_t next_id() {
         return sm_last_id++;
     }
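get_mem_pool() above resolves to one MemPool<T> per thread per event type: the pool hangs off the calling thread's own Profiler (reached through get_instance()), is owned type-erased by an AnyPtr whose deleter is a capture-less lambda, and is cached in a thread_local pointer so the type_index lookup happens only once per thread. A simplified standalone analogue of that caching shape (PerThreadPools/SimplePool are placeholders; it collapses the profiler registry into a thread_local object and therefore skips the sm_mutex registration step the real code performs):

    #include <memory>
    #include <typeindex>
    #include <unordered_map>

    template <typename T>
    struct SimplePool {};   // stand-in for MemPool<T>

    struct PerThreadPools {
        std::unordered_map<std::type_index, std::shared_ptr<void>> pools;   // type-erased owners

        static PerThreadPools& instance() {
            thread_local PerThreadPools inst;   // one map per thread, touched only by its owner
            return inst;
        }

        template <typename T>
        static SimplePool<T>& get() {
            thread_local SimplePool<T>* cached = nullptr;   // resolved once per thread per T
            if (!cached) {
                auto& slot = instance().pools[typeid(SimplePool<T>)];
                if (!slot) slot = std::make_shared<SimplePool<T>>();
                cached = static_cast<SimplePool<T>*>(slot.get());
            }
            return *cached;
        }
    };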
@@ -119,13 +185,19 @@ public:
     template <typename T, typename... TArgs>
     static uint64_t record(TArgs&&... args) {
         auto& profiler = get_instance();
+        auto& mem_pool = get_mem_pool<T>();
         if constexpr (sm_debug) {
             Status expected = Running;
             mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
         }
         uint64_t id = next_id();
         profiler::Time time = sm_timer.record_host();
-        profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}});
+        auto deleter = [](void* obj, void* ptr){
+            reinterpret_cast<MemPool<T>*>(obj)->free(reinterpret_cast<T*>(ptr));
+        };
+        profiler.m_records.emplace_back(id, profiler.m_thread_id, time, AnyPtr{
+            mem_pool.alloc(T{std::forward<TArgs>(args)...}), {&mem_pool, deleter}
+        });
         if constexpr (sm_debug) {
             Status expected = Recording;
             mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
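The net effect of the record() change: the event object is allocated from the per-thread MemPool<T> instead of being copied into a std::any, and the AnyPtr's deleter hands it back to that same pool when the collected record is destroyed. A hedged usage sketch of the round trip (assumes the declarations above; MyEvent is a hypothetical event struct, and profiling is assumed to have been started already):

    struct MyEvent { std::string title; };    // hypothetical, for illustration only

    void record_roundtrip_sketch() {
        MGB_RECORD_EVENT(MyEvent, "demo");    // expands to Profiler::record<MyEvent>(MyEvent{"demo"})
        auto bundle = Profiler::collect();
        for (auto& entry : bundle.entries) {
            if (entry.data.is_exactly<MyEvent>()) {
                mgb_log("recorded: %s", entry.data.as<MyEvent>()->title.c_str());
            }
        }
    }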
@@ -146,7 +218,9 @@ public:
         std::vector<entry_t> profile_data = std::move(sm_records);
         for (auto&& [tid, profiler]: sm_profilers) {
             sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
-            profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end());
+            profile_data.insert(profile_data.end(),
+                    std::make_move_iterator(profiler->m_records.begin()),
+                    std::make_move_iterator(profiler->m_records.end()));
             profiler->m_records.clear();
             profiler->m_records.reserve(sm_preferred_capacity);
         }
@@ -160,11 +234,11 @@ public:
                 mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
             }
         }
-        bundle.entries = profile_data;
+        bundle.entries = std::move(profile_data);
         bundle.options = get_options();
         bundle.start_at = sm_start_at;
         bundle.thread_dict = get_thread_dict();
-        return bundle;
+        return std::move(bundle);
     }

     static option_t get_option(std::string key, option_t default_val) {
@@ -203,31 +277,6 @@ public:
 };

-class ProfileDataCollector {
-public:
-    template <typename T>
-    using SubCollector = std::function<void(uint64_t, std::thread::id, uint64_t, T)>;
-private:
-    std::unordered_map<std::type_index, SubCollector<std::any>> m_collectors;
-public:
-    template <typename T>
-    ProfileDataCollector& handle(SubCollector<T> collector) {
-        auto erased = [collector](uint64_t id, std::thread::id tid, uint64_t time, std::any data){
-            collector(id, tid, time, std::any_cast<T>(std::move(data)));
-        };
-        m_collectors[typeid(T)] = erased;
-        return *this;
-    }
-    void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) {
-        std::type_index type = event.type();
-        if (m_collectors.count(type) == 0) {
-            return;
-        }
-        auto& handler = m_collectors.at(type);
-        handler(id, tid, time, std::move(event));
-    }
-};
-
 #define MGB_RECORD_EVENT(type, ...) \
     if (mgb::imperative::Profiler::is_profiling()) { \
         mgb::imperative::Profiler::record<type>(type{__VA_ARGS__}); \
@@ -25,11 +25,11 @@ TEST(TestProfiler, ImperativeLogProfile) {
     imperative_log_profile("XXX");
     auto results = imperative::Profiler::collect();
     imperative::Profiler::stop_profile();
-    mgb_assert(results.size() == 2);
-    auto* event_start = std::any_cast<profiler::CustomEvent>(&results[0].second.data);
-    auto* event_finish = std::any_cast<profiler::CustomFinishEvent>(&results[1].second.data);
+    mgb_assert(results.entries.size() == 2);
+    auto* event_start = results.entries[0].data.as<profiler::CustomEvent>();
+    auto* event_finish = results.entries[1].data.as<profiler::CustomFinishEvent>();
     mgb_assert(event_start && event_start->title == "XXX");
     mgb_assert(event_finish && event_finish->title == "XXX");
-    mgb_assert(results[0].second.time < results[1].second.time);
-    mgb_assert(results[0].second.id < results[1].second.id);
+    mgb_assert(results.entries[0].time < results.entries[1].time);
+    mgb_assert(results.entries[0].id < results.entries[1].id);
 }