| @@ -6,24 +6,148 @@ | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| from typing import Optional | |||
| import base64 | |||
| import json | |||
| import os | |||
| from typing import List, Optional | |||
| from ..core._imperative_rt import ProfilerImpl | |||
| from ..core._imperative_rt import OperatorNodeConfig, ProfileEntry | |||
| from ..core._imperative_rt import ProfilerImpl as _Profiler | |||
| from ..core._imperative_rt.imperative import sync | |||
| from ..core._imperative_rt.ops import CollectiveCommMode | |||
| from ..core.ops.builtin import GetVarShape | |||
| class Profiler: | |||
| def __init__(self, path: Optional[str] = None): | |||
| self.impl = ProfilerImpl(path) | |||
| r""" | |||
| Profile graph execution in imperative mode. | |||
| :type path: Optional[str] | |||
| :param path: default path for profiler to dump | |||
| Examples: | |||
| .. testcode:: | |||
| import megengine as mge | |||
| import megengine.module as M | |||
| import megengine.utils.profiler.Profiler | |||
| # With Learnable Parameters | |||
| for iter in range(0, 10): | |||
| # Only profile record of last iter would be saved | |||
| with Profiler("profile.json"): | |||
| # your code here | |||
| # Then open the profile file in chrome timeline window | |||
| """ | |||
| # see https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html | |||
| GOOD = "good" | |||
| BAD = "bad" | |||
| TERRIBLE = "terrible" | |||
| BLACK = "black" | |||
| GREY = "grey" | |||
| WHITE = "white" | |||
| YELLOW = "yellow" | |||
| OLIVE = "olive" | |||
| def __init__(self, path: str = "profile.json"): | |||
| self._impl = _Profiler() | |||
| self._path = path | |||
| self._color_map = {} | |||
| self._type_map = { | |||
| OperatorNodeConfig: lambda x: self.print_opnode_config(x), | |||
| bytes: lambda x: base64.encodebytes(x).decode("ascii"), | |||
| CollectiveCommMode: lambda x: str(x), | |||
| } | |||
| def __enter__(self): | |||
| sync() | |||
| self.impl.enable() | |||
| self._impl.start() | |||
| return self | |||
| def __exit__(self, exc_type, exc_val, exc_tb): | |||
| sync() | |||
| self.impl.disable() | |||
| self._impl.stop() | |||
| if self._path is not None: | |||
| self.dump() | |||
| def recolor(self, target: str, color: str): | |||
| self._color_map[target] = color | |||
| return self | |||
| def print_opnode_config(self, config): | |||
| return self.make_dict( | |||
| name=config.name, dtype=config.dtype, comp_node_arr=config.comp_node_arr, | |||
| ) | |||
| def fetch_attrs(self, op): | |||
| attrs = dir(op) | |||
| results = {} | |||
| for attr in attrs: | |||
| if attr.startswith("_"): | |||
| continue | |||
| value = op.__getattribute__(attr) | |||
| if callable(value): | |||
| continue | |||
| value_type = type(value) | |||
| if value_type in self._type_map: | |||
| value = self._type_map[value_type](value) | |||
| results[attr] = value | |||
| return results | |||
| def make_dict(self, **kwargs): | |||
| unused_keys = [] | |||
| for k, v in kwargs.items(): | |||
| if v is None: | |||
| unused_keys.append(k) | |||
| for k in unused_keys: | |||
| del kwargs[k] | |||
| return kwargs | |||
| def dump(self, path: Optional[str] = None): | |||
| self.impl.dump(path) | |||
| pid = os.getpid() | |||
| if path is None: | |||
| path = self._path | |||
| trace_events = [] | |||
| def append_event(**kwargs): | |||
| trace_events.append(self.make_dict(**kwargs)) | |||
| entries: List[ProfileEntry] = self._impl.dump() | |||
| for id, entry in enumerate(entries): | |||
| op = entry.op | |||
| name = type(op).__name__ | |||
| host_begin, host_end = entry.host | |||
| device_list = entry.device_list | |||
| args = self.fetch_attrs(op) | |||
| args["__id__"] = "[{}]".format(id) | |||
| cname = self._color_map[name] if name in self._color_map else None | |||
| cat = name | |||
| for ts, ph in [(host_begin, "B"), (host_end, "E")]: | |||
| append_event( | |||
| name=name, | |||
| ph=ph, | |||
| ts=ts * 1000, | |||
| pid=pid, | |||
| tid="host", | |||
| args=args, | |||
| cname=cname, | |||
| cat=cat, | |||
| ) | |||
| for device, device_begin, device_end in device_list: | |||
| for ts, ph in [(device_begin(), "B"), (device_end(), "E")]: | |||
| append_event( | |||
| name=name, | |||
| ph=ph, | |||
| ts=ts * 1000, | |||
| pid=pid, | |||
| tid=str(device), | |||
| args=args, | |||
| cname=cname, | |||
| ) | |||
| with open(path, "w") as f: | |||
| json.dump(trace_events, f, indent=2) | |||
| @@ -651,9 +651,14 @@ PyObject* npy::dtype_mgb2np(mgb::DType dtype) { | |||
| // https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.PyArray_TypeObjectFromType | |||
| // the following is equivalent to PyArray_TypeObjectFromType for built-in | |||
| // types. | |||
| if (!dtype.valid()) { | |||
| Py_XINCREF(Py_None); | |||
| return Py_None; | |||
| } | |||
| auto descr = dtype_mgb2np_descr(dtype); | |||
| if (descr == nullptr) { | |||
| return nullptr; | |||
| Py_XINCREF(Py_None); | |||
| return Py_None; | |||
| } | |||
| if (dtype.has_param()) { | |||
| return reinterpret_cast<PyObject*>(descr.release()); | |||
| @@ -199,32 +199,22 @@ void init_utils(py::module m) { | |||
| m.def("_get_device_count", &mgb::CompNode::get_device_count, | |||
| "Get total number of specific devices on this system"); | |||
| using mgb::imperative::Profiler; | |||
| using mgb::imperative::ProfileEntry; | |||
| py::class_<Profiler>(m, "ProfilerImpl") | |||
| py::class_<ProfileEntry>(m, "ProfileEntry") | |||
| .def_readwrite("op", &ProfileEntry::op) | |||
| .def_readwrite("host", &ProfileEntry::host) | |||
| .def_readwrite("device_list", &ProfileEntry::device_list); | |||
| py::class_<mgb::imperative::Profiler>(m, "ProfilerImpl") | |||
| .def(py::init<>()) | |||
| .def(py::init<const std::string&>()) | |||
| .def("enable", | |||
| [](Profiler& profiler) -> Profiler& { | |||
| profiler.enable(); | |||
| return profiler; | |||
| }) | |||
| .def("disable", | |||
| [](Profiler& profiler) { | |||
| if (profiler.get_dump_count() == 0) { | |||
| profiler.dump(); | |||
| } | |||
| profiler.disable(); | |||
| }) | |||
| .def("dump", | |||
| [](Profiler& profiler, std::optional<std::string> path) { | |||
| if (path.has_value()) { | |||
| profiler.dump(path.value()); | |||
| } else { | |||
| profiler.dump(); | |||
| } | |||
| }, | |||
| py::arg("path") = std::optional<std::string>()); | |||
| .def("start", | |||
| [](mgb::imperative::Profiler& profiler) { profiler.start(); }) | |||
| .def("stop", | |||
| [](mgb::imperative::Profiler& profiler) { profiler.stop(); }) | |||
| .def("dump", [](mgb::imperative::Profiler& profiler) { | |||
| return profiler.get_profile(); | |||
| }); | |||
| using mgb::imperative::TensorSanityCheck; | |||
| py::class_<TensorSanityCheck>(m, "TensorSanityCheckImpl") | |||
| @@ -0,0 +1,68 @@ | |||
| #include "./event_pool.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| EventPool::EventPool(size_t flags) : m_flags{flags} {} | |||
| EventPool& EventPool::with_timer() { | |||
| static Spinlock lock; | |||
| static std::unique_ptr<EventPool> ptr; | |||
| MGB_LOCK_GUARD(lock); | |||
| if (!ptr || ptr->is_finalized()) { | |||
| ptr.reset(new EventPool(CompNode::Event::NEED_TIMER)); | |||
| } | |||
| return *ptr; | |||
| } | |||
| EventPool& EventPool::without_timer() { | |||
| static Spinlock lock; | |||
| static std::unique_ptr<EventPool> ptr; | |||
| MGB_LOCK_GUARD(lock); | |||
| if (!ptr || ptr->is_finalized()) { | |||
| ptr.reset(new EventPool()); | |||
| } | |||
| return *ptr; | |||
| } | |||
| CompNode::Event* EventPool::alloc(CompNode cn) { | |||
| CompNode::EventPool* pool; | |||
| { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| auto iter = m_cn2pool.find(cn); | |||
| if (iter == m_cn2pool.end()) { | |||
| iter = m_cn2pool | |||
| .emplace(std::piecewise_construct, | |||
| std::forward_as_tuple(cn), | |||
| std::forward_as_tuple(cn, m_flags)) | |||
| .first; | |||
| } | |||
| pool = &iter->second; | |||
| } | |||
| return pool->alloc(); | |||
| } | |||
| std::shared_ptr<CompNode::Event> EventPool::alloc_shared(CompNode cn) { | |||
| auto* raw_event = alloc(cn); | |||
| return {raw_event, [this](CompNode::Event* event){ this->free(event); }}; | |||
| } | |||
| void EventPool::free(CompNode::Event* event) { | |||
| CompNode::EventPool* pool; | |||
| { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| pool = &m_cn2pool.at(event->comp_node()); | |||
| } | |||
| pool->free(event); | |||
| } | |||
| std::shared_ptr<void> EventPool::on_comp_node_finalize() { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| for (auto&& i : m_cn2pool) { | |||
| i.second.assert_all_freed(); | |||
| } | |||
| return {}; | |||
| } | |||
| EventPool::~EventPool() { | |||
| for (auto&& i : m_cn2pool) { | |||
| i.second.assert_all_freed(); | |||
| } | |||
| } | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
| @@ -0,0 +1,25 @@ | |||
| #pragma once | |||
| #include "megbrain/comp_node.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| class EventPool : CompNodeDepedentObject { | |||
| CompNode::UnorderedMap<CompNode::EventPool> m_cn2pool; | |||
| Spinlock m_lock; | |||
| size_t m_flags; | |||
| EventPool(size_t flags = 0); | |||
| public: | |||
| static EventPool& with_timer(); | |||
| static EventPool& without_timer(); | |||
| CompNode::Event* alloc(CompNode cn); | |||
| std::shared_ptr<CompNode::Event> alloc_shared(CompNode cn); | |||
| void free(CompNode::Event* event); | |||
| std::shared_ptr<void> on_comp_node_finalize(); | |||
| ~EventPool(); | |||
| }; | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
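A usage sketch of the new pool, assuming it sits next to `event_pool.h` inside `imperative/src/impl` of a MegBrain checkout (the header is internal, and `time_span_on` is a hypothetical helper); it times a span on a compute node with timer-capable events, the same way `DeviceTimer` does:

```cpp
#include <cstdio>
#include "./event_pool.h"

using namespace mgb;
using namespace mgb::imperative;

void time_span_on(CompNode cn) {
    // with_timer() hands out events created with NEED_TIMER, so
    // elapsed_time_until() is meaningful; without_timer() events are cheaper.
    auto begin = EventPool::with_timer().alloc_shared(cn);
    begin->record();
    // ... enqueue some work on `cn` here ...
    auto end = EventPool::with_timer().alloc_shared(cn);
    end->record();
    end->host_wait();  // block until both recorded events have completed
    std::printf("span: %.6f s\n", begin->elapsed_time_until(*end));
    // the shared_ptr deleters return both events to the pool automatically
}
```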
| @@ -11,6 +11,7 @@ | |||
| #include "megbrain/imperative.h" | |||
| #include "megbrain/imperative/blob_manager.h" | |||
| #include "./event_pool.h" | |||
| #include <mutex> | |||
| namespace mgb { | |||
| @@ -18,86 +19,31 @@ namespace imperative { | |||
| namespace { | |||
| class EventPool : CompNodeDepedentObject { | |||
| CompNode::UnorderedMap<CompNode::EventPool> m_cn2pool; | |||
| Spinlock m_lock; | |||
| EventPool() = default; | |||
| public: | |||
| static EventPool& inst() { | |||
| static Spinlock lock; | |||
| static std::unique_ptr<EventPool> ptr; | |||
| MGB_LOCK_GUARD(lock); | |||
| if (!ptr || ptr->is_finalized()) { | |||
| ptr.reset(new EventPool()); | |||
| } | |||
| return *ptr; | |||
| } | |||
| CompNode::Event* alloc(CompNode cn) { | |||
| CompNode::EventPool *pool; | |||
| { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| auto iter = m_cn2pool.find(cn); | |||
| if (iter == m_cn2pool.end()) { | |||
| iter = m_cn2pool.emplace( | |||
| std::piecewise_construct, | |||
| std::forward_as_tuple(cn), | |||
| std::forward_as_tuple(cn)).first; | |||
| } | |||
| pool = &iter->second; | |||
| } | |||
| return pool->alloc(); | |||
| } | |||
| void free(CompNode::Event* event) { | |||
| CompNode::EventPool* pool; | |||
| { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| pool = &m_cn2pool.at(event->comp_node()); | |||
| } | |||
| pool->free(event); | |||
| } | |||
| std::shared_ptr<void> on_comp_node_finalize() override { | |||
| MGB_LOCK_GUARD(m_lock); | |||
| for (auto&& i : m_cn2pool) { | |||
| i.second.assert_all_freed(); | |||
| } | |||
| return {}; | |||
| } | |||
| ~EventPool() { | |||
| for (auto&& i : m_cn2pool) { | |||
| i.second.assert_all_freed(); | |||
| } | |||
| } | |||
| }; | |||
| class AsyncReleaser : public CompNodeDepedentObject { | |||
| struct WaiterParam { | |||
| CompNode cn; | |||
| CompNode::Event *event; | |||
| CompNode::Event* event; | |||
| BlobPtr blob; | |||
| HostTensorStorage::RawStorage storage; | |||
| }; | |||
| class Waiter final: public AsyncQueueSC<WaiterParam, Waiter> { | |||
| AsyncReleaser *m_par_releaser; | |||
| public: | |||
| Waiter(AsyncReleaser *releaser): | |||
| m_par_releaser(releaser) | |||
| { | |||
| class Waiter final : public AsyncQueueSC<WaiterParam, Waiter> { | |||
| AsyncReleaser* m_par_releaser; | |||
| public: | |||
| Waiter(AsyncReleaser* releaser) : m_par_releaser(releaser) {} | |||
| void process_one_task(WaiterParam& param) { | |||
| if (param.event->finished()) { | |||
| param.blob.reset(); | |||
| param.storage.reset(); | |||
| EventPool::without_timer().free(param.event); | |||
| return; | |||
| } | |||
| void process_one_task(WaiterParam ¶m) { | |||
| if (param.event->finished()) { | |||
| param.blob.reset(); | |||
| param.storage.reset(); | |||
| EventPool::inst().free(param.event); | |||
| return; | |||
| } | |||
| using namespace std::literals; | |||
| std::this_thread::sleep_for(1us); | |||
| add_task(std::move(param)); | |||
| } | |||
| using namespace std::literals; | |||
| std::this_thread::sleep_for(1us); | |||
| add_task(std::move(param)); | |||
| } | |||
| }; | |||
| Waiter m_waiter{this}; | |||
| @@ -113,20 +59,17 @@ public: | |||
| return &releaser; | |||
| } | |||
| ~AsyncReleaser() { | |||
| m_waiter.wait_task_queue_empty(); | |||
| } | |||
| ~AsyncReleaser() { m_waiter.wait_task_queue_empty(); } | |||
| void add(BlobPtr blob, CompNode cn) { | |||
| add(cn, std::move(blob), {}); | |||
| } | |||
| void add(BlobPtr blob, CompNode cn) { add(cn, std::move(blob), {}); } | |||
| void add(const HostTensorND& hv) { | |||
| add(hv.comp_node(), {}, hv.storage().raw_storage()); | |||
| } | |||
| void add(CompNode cn, BlobPtr blob, HostTensorStorage::RawStorage storage = {}) { | |||
| auto event = EventPool::inst().alloc(cn); | |||
| void add(CompNode cn, BlobPtr blob, | |||
| HostTensorStorage::RawStorage storage = {}) { | |||
| auto event = EventPool::without_timer().alloc(cn); | |||
| event->record(); | |||
| m_waiter.add_task({cn, event, std::move(blob), std::move(storage)}); | |||
| } | |||
| @@ -290,10 +233,10 @@ struct MultiCNConstTensorCache : CompNodeDepedentObject { | |||
| MultiCNConstTensorCache const_tensor_cache; | |||
| } // namespace | |||
| } // namespace | |||
| void EventDeleter::operator()(CompNode::Event* event) { | |||
| EventPool::inst().free(event); | |||
| EventPool::without_timer().free(event); | |||
| } | |||
| Blob::Blob(const DeviceTensorStorage& s): | |||
| @@ -373,7 +316,7 @@ void Tensor::fetch_value() { | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| if (m_value.empty()) { | |||
| m_value.copy_from(dev_tensor()); | |||
| m_value_ready.reset(EventPool::inst().alloc(comp_node())); | |||
| m_value_ready.reset(EventPool::without_timer().alloc(comp_node())); | |||
| m_value_ready->record(); | |||
| } | |||
| } | |||
| @@ -421,7 +364,7 @@ CompNode::Event* Tensor::get_or_create_event() { | |||
| return e; | |||
| } | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -11,63 +11,18 @@ | |||
| #include "megbrain/imperative/profiler.h" | |||
| #if defined(_MSC_VER) || defined(WIN32) | |||
| #include <windows.h> | |||
| #define getpid GetCurrentProcessId | |||
| #else | |||
| #include <sys/unistd.h> | |||
| #endif | |||
| #if defined(__APPLE__) || defined(__MACOSX) | |||
| #include <unistd.h> | |||
| #endif | |||
| #include <variant> | |||
| #include "megbrain/imperative/ops/opr_attr.h" | |||
| #include "megbrain/imperative/physical_tensor.h" | |||
| #include "./event_pool.h" | |||
| #include "./op_trait.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| class OpDefInfo{ | |||
| public: | |||
| size_t id; | |||
| std::string name; | |||
| }; | |||
| class ProfilerEntry { | |||
| public: | |||
| ProfilerEntry(size_t index, Profiler::EventKind type, std::unique_ptr<CompNode::Event> device) | |||
| : index{index}, type{type}, device{std::move(device)}{ | |||
| } | |||
| ProfilerEntry(size_t index, Profiler::EventKind type, double host): index{index}, type{type}, host{host}{ | |||
| } | |||
| size_t index; | |||
| Profiler::EventKind type; | |||
| std::unique_ptr<CompNode::Event> device = nullptr; | |||
| double host = 0; | |||
| }; | |||
| class ProfilerPrivate { | |||
| public: | |||
| std::vector<OpDefInfo> op_list; | |||
| std::vector<ProfilerEntry> entry_list; | |||
| std::vector<std::unique_ptr<CompNode::Event>> event_list; | |||
| std::vector<std::tuple<OpTrait*, std::unique_ptr<ApplyOnPhysicalTensor>>> | |||
| hook_list; | |||
| ThinHashMap<CompNode, std::tuple<CompNode::Event*, double>> | |||
| comp_node_begin_map; | |||
| ThinHashMap<CompNode, CompNode::Event*> comp_node_end_map; | |||
| RealTimer timer; | |||
| size_t dump_count = 0; | |||
| bool enabled = false; | |||
| std::string path; | |||
| }; | |||
| namespace { | |||
| CompNode::UnorderedSet collect_comp_nodes( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs) { | |||
| @@ -80,145 +35,65 @@ CompNode::UnorderedSet collect_comp_nodes( | |||
| } | |||
| return comp_nodes; | |||
| } | |||
| } // namespace | |||
| std::unique_ptr<CompNode::Event> Profiler::create_event(CompNode comp_node){ | |||
| auto event = comp_node.create_event(CompNode::Event::NEED_TIMER); | |||
| event->record(); | |||
| auto& [begin, time] = m_private->comp_node_begin_map[comp_node]; | |||
| if (begin == nullptr) { | |||
| begin = event.get(); | |||
| time = m_private->timer.get_msecs(); | |||
| } | |||
| return event; | |||
| } | |||
| double Profiler::get_host_time_now(){ | |||
| return m_private->timer.get_msecs(); | |||
| } | |||
| double Profiler::get_device_time(CompNode::Event& event) { | |||
| auto [base_event, host_time] = | |||
| m_private->comp_node_begin_map[event.comp_node()]; | |||
| if (base_event == &event) { | |||
| return host_time; | |||
| } else { | |||
| return host_time + base_event->elapsed_time_until(event) * 1000; | |||
| } | |||
| } | |||
| size_t Profiler::get_dump_count(){ | |||
| return m_private->dump_count; | |||
| } | |||
| Profiler::Profiler() { | |||
| m_private = std::make_unique<ProfilerPrivate>(); | |||
| } | |||
| Profiler::Profiler(const std::string& path): Profiler() { | |||
| m_private->path = path; | |||
| } | |||
| } // namespace | |||
| void Profiler::enable() { | |||
| m_private->enabled = true; | |||
| CompNode::sync_all(); | |||
| OpTrait::for_each_trait([this](OpTrait& trait) { | |||
| auto backup = std::make_unique<ApplyOnPhysicalTensor>( | |||
| std::move(trait.apply_on_physical_tensor)); | |||
| trait.apply_on_physical_tensor = | |||
| [this, backup = backup.get()] ( | |||
| const OpDef& def, | |||
| const SmallVector<TensorPtr>& inputs){ | |||
| size_t index = m_private->op_list.size(); | |||
| std::string name = "[" + std::to_string(index) + "]" + print_op(def); | |||
| m_private->op_list.push_back({reinterpret_cast<size_t>(&def), name}); | |||
| m_private->entry_list.emplace_back(index, OprBegin, get_host_time_now()); | |||
| auto&& comp_nodes = collect_comp_nodes(def, inputs); | |||
| for (auto&& comp_node : comp_nodes) { | |||
| m_private->entry_list.emplace_back(index, OprBegin, create_event(comp_node)); | |||
| } | |||
| auto output = (*backup)(def, inputs); | |||
| for (auto&& comp_node : comp_nodes) { | |||
| m_private->entry_list.emplace_back(index, OprEnd, create_event(comp_node)); | |||
| } | |||
| m_private->entry_list.emplace_back(index, OprEnd, get_host_time_now()); | |||
| return output; | |||
| }; | |||
| m_private->hook_list.push_back({&trait, std::move(backup)}); | |||
| void DeviceTimer::reset(thin_function<double()> host_timer) { | |||
| CompNode::foreach ([this, host_timer](CompNode device) { | |||
| auto base_event = EventPool::with_timer().alloc_shared(device); | |||
| base_event->record(); | |||
| m_base_event_table[device] = {std::move(base_event), host_timer()}; | |||
| }); | |||
| } | |||
| void Profiler::disable() { | |||
| for (auto&& hook : m_private->hook_list) { | |||
| std::get<0>(hook)->apply_on_physical_tensor = | |||
| std::move(*std::get<1>(hook)); | |||
| } | |||
| m_private->hook_list.clear(); | |||
| m_private->enabled = false; | |||
| } | |||
| Profiler::~Profiler() { | |||
| } | |||
| void Profiler::dump(){ | |||
| dump(m_private->path); | |||
| thin_function<double()> DeviceTimer::get_device_time(CompNode device) { | |||
| auto event = EventPool::with_timer().alloc_shared(device); | |||
| event->record(); | |||
| auto base = m_base_event_table[device]; | |||
| return [base, event] { | |||
| auto [base_event, host_time] = base; | |||
| //TODO: sync once for each compnode | |||
| event->host_wait(); | |||
| return base_event->elapsed_time_until(*event) * 1000 + host_time; | |||
| }; | |||
| } | |||
| void Profiler::dump(const std::string& path) { | |||
| using namespace json; | |||
| auto obj = json::Object::make(); | |||
| if (!(*obj)["traceEvents"]) { | |||
| (*obj)["traceEvents"] = Array::make(); | |||
| } | |||
| auto& trace_events = (*obj)["traceEvents"]->cast_final<Array>(); | |||
| for (auto&& entry : m_private->entry_list) { | |||
| auto trace_event_ptr = Object::make(); | |||
| auto& trace_event = *trace_event_ptr; | |||
| std::string name; | |||
| size_t id; | |||
| int pid; | |||
| std::string tid; | |||
| double ts; | |||
| const char* ph; | |||
| name = m_private->op_list[entry.index].name; | |||
| id = entry.index; | |||
| pid = getpid(); | |||
| if (entry.device) { | |||
| entry.device->host_wait(); | |||
| ts = get_device_time(*entry.device); | |||
| tid = entry.device->comp_node().to_string(); | |||
| } else { | |||
| ts = entry.host; | |||
| tid = "host"; | |||
| } | |||
| switch (entry.type) { | |||
| case OprBegin: { | |||
| ph = "B"; | |||
| break; | |||
| void Profiler::start() { | |||
| m_host_timer.reset(); | |||
| m_device_timer.reset([&] { return m_host_timer.get_msecs(); }); | |||
| OpTrait::for_each_trait([this](OpTrait& trait) { | |||
| FunctionHooker hooker{&trait.apply_on_physical_tensor}; | |||
| hooker.apply_hook([this](auto&& apply, const OpDef& def, | |||
| const SmallVector<TensorPtr>& inputs) { | |||
| ProfileEntry entry; | |||
| entry.op = def.copy(); | |||
| double host_begin = m_host_timer.get_msecs(); | |||
| auto&& comp_nodes = collect_comp_nodes(def, inputs); | |||
| for (auto&& comp_node : comp_nodes) { | |||
| entry.device_list.push_back( | |||
| {comp_node, | |||
| m_device_timer.get_device_time(comp_node), | |||
| {}}); | |||
| } | |||
| case OprEnd: { | |||
| ph = "E"; | |||
| break; | |||
| auto outputs = apply(def, inputs); | |||
| for (auto& [cn, dev_begin, dev_end] : entry.device_list) { | |||
| MGB_MARK_USED_VAR(cn); | |||
| MGB_MARK_USED_VAR(dev_begin); | |||
| dev_end = m_device_timer.get_device_time(cn); | |||
| } | |||
| } | |||
| trace_event["name"] = String::make(name); | |||
| trace_event["id"] = Number::make(id); | |||
| trace_event["pid"] = Number::make(pid); | |||
| trace_event["tid"] = String::make(tid); | |||
| trace_event["ts"] = Number::make(ts * 1000); | |||
| trace_event["ph"] = String::make(ph); | |||
| trace_events.add(std::move(trace_event_ptr)); | |||
| } | |||
| obj->writeto_fpath(path.empty() ? path : m_private->path); | |||
| m_private->dump_count++; | |||
| entry.host = {host_begin, m_host_timer.get_msecs()}; | |||
| m_profile->push_back(std::move(entry)); | |||
| return outputs; | |||
| }); | |||
| m_hooker_list.push_back(std::move(hooker)); | |||
| }); | |||
| } | |||
| std::string Profiler::print_op(const OpDef& def){ | |||
| auto* opr_attr = def.try_cast_final<const OprAttr>(); | |||
| if(opr_attr){ | |||
| return std::string("OprAttr:") + opr_attr->type; | |||
| void Profiler::stop() { | |||
| m_hooker_list.clear(); | |||
| for (auto& entry : *m_profile) { | |||
| entry.wait_device(); | |||
| } | |||
| return def.dyn_typeinfo()->name; | |||
| } | |||
| } // namespace imperative | |||
| @@ -0,0 +1,55 @@ | |||
| /** | |||
| * \file imperative/src/include/megbrain/imperative/function_hook.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #pragma once | |||
| #include "megbrain/utils/thin/function.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| template <typename TFunction> | |||
| class FunctionHooker; | |||
| template <typename TRet, typename... TArgs> | |||
| class FunctionHooker<TRet(TArgs...)> { | |||
| public: | |||
| using FunctionType = thin_function<TRet(TArgs&&...)>; | |||
| using HookType = thin_function<TRet(FunctionType, TArgs&&...)>; | |||
| explicit FunctionHooker(FunctionType* fptr) : m_fptr{fptr} {} | |||
| public: | |||
| FunctionHooker& apply_hook(HookType&& hook) { | |||
| if (!m_backup) { | |||
| FunctionType* backup = new FunctionType(*m_fptr); | |||
| std::function<void(FunctionType*)> restorer = | |||
| [fptr = m_fptr](FunctionType* bkp) -> void { | |||
| *fptr = *bkp; | |||
| delete bkp; | |||
| }; | |||
| m_backup = decltype(m_backup)(backup, restorer); | |||
| } | |||
| *m_fptr = [func = *m_fptr, hook](TArgs&&... args) -> TRet { | |||
| return hook(func, std::forward<TArgs>(args)...); | |||
| }; | |||
| return *this; | |||
| } | |||
| private: | |||
| FunctionType* m_fptr; | |||
| std::unique_ptr<FunctionType, std::function<void(FunctionType*)>> m_backup; | |||
| }; | |||
| template <typename TRet, typename... TArgs> | |||
| FunctionHooker(thin_function<TRet(TArgs...)>* f) | |||
| ->FunctionHooker<TRet(TArgs...)>; | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
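Because the hooker restores the original function when its backup pointer is destroyed, simply dropping the hooker (as `Profiler::stop()` does by clearing `m_hooker_list`) is what unhooks the trait. A minimal standalone sketch of that pattern, assuming a MegBrain source tree (`thin_function` is a thin alias over `std::function` there); `square` and the printed strings are illustrative only:

```cpp
#include <cstdio>
#include "megbrain/imperative/function_hook.h"

using namespace mgb::imperative;

int main() {
    // Reference parameters keep the deduction guide and the TArgs&&-based
    // FunctionType consistent, mirroring apply_on_physical_tensor's signature.
    thin_function<int(const int&)> square = [](const int& x) { return x * x; };
    {
        FunctionHooker hooker{&square};
        hooker.apply_hook([](auto&& prev, const int& x) -> int {
            std::printf("entering with x=%d\n", x);
            int y = prev(x);               // call the original function
            std::printf("leaving with y=%d\n", y);
            return y;
        });
        std::printf("hooked: %d\n", square(3));    // traced, returns 9
    }  // hooker destroyed -> the backup restores the original target
    std::printf("restored: %d\n", square(4));      // plain call, returns 16
}
```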
| @@ -11,6 +11,8 @@ | |||
| #pragma once | |||
| #include <variant> | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/graph/event.h" | |||
| #include "megbrain/utils/json.h" | |||
| @@ -18,37 +20,59 @@ | |||
| #include "megbrain/imperative/op_def.h" | |||
| #include "megbrain/imperative/function_hook.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| class ProfilerPrivate; | |||
| struct ProfileEntry{ | |||
| using TimeClosure = std::function<double()>; | |||
| std::shared_ptr<OpDef> op; | |||
| std::tuple<double, double> host; | |||
| std::vector<std::tuple<CompNode, TimeClosure, TimeClosure>> device_list; | |||
| void wait_device(){ | |||
| for(auto& [cn, begin, end]: device_list){ | |||
| MGB_MARK_USED_VAR(cn); | |||
| begin = [begin=begin()]{ return begin; }; | |||
| end = [end = end()]{ return end; }; | |||
| } | |||
| } | |||
| }; | |||
| using Profile = std::vector<ProfileEntry>; | |||
| using OpDefPrinter = thin_function<std::string(const OpDef&)>; | |||
| class DeviceTimer { | |||
| public: | |||
| using SharedEvent = std::shared_ptr<CompNode::Event>; | |||
| DeviceTimer() = default; | |||
| void reset(thin_function<double()> host_timer); | |||
| thin_function<double()> get_device_time(CompNode device); | |||
| class Profiler { | |||
| private: | |||
| std::unique_ptr<ProfilerPrivate> m_private; | |||
| CompNode::UnorderedMap<std::tuple<SharedEvent, double>> m_base_event_table; | |||
| }; | |||
| class Profiler { | |||
| public: | |||
| enum EventKind { OprBegin, OprEnd }; | |||
| Profiler(Profile* profile = nullptr) { | |||
| if (!profile) { | |||
| m_owned_profile = std::make_unique<Profile>(); | |||
| profile = m_owned_profile.get(); | |||
| } | |||
| m_profile = profile; | |||
| } | |||
| void start(); | |||
| void stop(); | |||
| Profile& get_profile() { return *m_profile; } | |||
| public: | |||
| Profiler(); | |||
| Profiler(const std::string& path); | |||
| ~Profiler(); | |||
| void enable(); | |||
| void disable(); | |||
| void dump(); | |||
| void dump(const std::string& path); | |||
| void record_host(size_t id, std::string name, EventKind type, | |||
| double host_time); | |||
| void record_device(size_t id, std::string name, EventKind type, | |||
| double host_time, CompNode comp_node); | |||
| double get_device_time(CompNode::Event& event); | |||
| size_t get_dump_count(); | |||
| std::unique_ptr<CompNode::Event> create_event(CompNode comp_node); | |||
| double get_host_time_now(); | |||
| std::string print_op(const OpDef& def); | |||
| private: | |||
| DeviceTimer m_device_timer; | |||
| RealTimer m_host_timer; | |||
| Profile* m_profile; | |||
| std::unique_ptr<Profile> m_owned_profile; | |||
| std::vector<FunctionHooker<decltype(OpDef::apply_on_physical_tensor)>> | |||
| m_hooker_list; | |||
| }; | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
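For orientation, a hedged C++ sketch of how this interface is meant to be driven; it mirrors what the Python `ProfilerImpl` binding does (`profile_something` is a hypothetical helper and assumes a MegBrain build):

```cpp
#include <cstdio>
#include "megbrain/imperative/profiler.h"

using namespace mgb::imperative;

void profile_something() {
    Profiler profiler;   // owns its Profile when none is passed in
    profiler.start();    // hooks apply_on_physical_tensor for every OpTrait
    // ... execute imperative ops here ...
    profiler.stop();     // unhooks the traits and waits for device events

    for (auto& entry : profiler.get_profile()) {
        auto [host_begin, host_end] = entry.host;  // milliseconds on the host timer
        std::printf("%s: host %.3f -> %.3f ms, %zu device(s)\n",
                    entry.op->dyn_typeinfo()->name, host_begin, host_end,
                    entry.device_list.size());
        for (auto& [cn, dev_begin, dev_end] : entry.device_list) {
            // After stop(), the closures return cached times immediately.
            std::printf("  %s: %.3f -> %.3f\n", cn.to_string().c_str(),
                        dev_begin(), dev_end());
        }
    }
}
```

The device timestamps returned by those closures are reconstructed as `base_host_time + 1000 * base_event->elapsed_time_until(event)`, which is how `DeviceTimer::get_device_time` maps device events onto the host millisecond clock.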
| @@ -89,8 +89,8 @@ namespace { | |||
| /* ==================== EventPool ==================== */ | |||
| CompNode::EventPool::EventPool(CompNode cn): | |||
| m_cn{cn} | |||
| CompNode::EventPool::EventPool(CompNode cn, size_t flags): | |||
| m_cn{cn}, m_flags{flags} | |||
| { | |||
| } | |||
| @@ -105,7 +105,7 @@ CompNode::Event* CompNode::EventPool::alloc() { | |||
| m_free.pop_back(); | |||
| return rst; | |||
| } | |||
| m_allocated.push_back(m_cn.create_event()); | |||
| m_allocated.push_back(m_cn.create_event(m_flags)); | |||
| return m_allocated.back().get(); | |||
| } | |||
| @@ -643,9 +643,10 @@ class CompNode::EventPool { | |||
| std::vector<std::unique_ptr<CompNode::Event>> m_allocated; | |||
| std::vector<CompNode::Event*> m_free; | |||
| Spinlock m_lock; | |||
| size_t m_flags; | |||
| public: | |||
| explicit EventPool(CompNode cn); | |||
| explicit EventPool(CompNode cn, size_t flags = 0); | |||
| ~EventPool(); | |||
| CompNode::Event* alloc(); | |||
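For illustration, a hedged sketch of what the new `flags` argument enables: a per-device `CompNode::EventPool` that hands out timer-capable events (`measure` is a hypothetical helper and assumes a MegBrain build):

```cpp
#include "megbrain/comp_node.h"

using namespace mgb;

double measure(CompNode cn) {
    CompNode::EventPool pool{cn, CompNode::Event::NEED_TIMER};
    auto* begin = pool.alloc();   // internally m_cn.create_event(m_flags)
    auto* end = pool.alloc();
    begin->record();
    // ... enqueue work on `cn` ...
    end->record();
    end->host_wait();
    double seconds = begin->elapsed_time_until(*end);
    pool.free(begin);             // return events before the pool is destroyed
    pool.free(end);
    return seconds;
}
```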