GitOrigin-RevId: fd0814fdb3
tags/v1.6.0
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_emitter.h"
#include <numeric>
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/io.h"

using namespace mgb;
using namespace gopt;

@@ -243,4 +244,63 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
    }
    return builders;
}

/* ============== PaddingEmitter ================= */
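// Emits a builder that appends zero padding of extent `const_extent` along
// `axis`: a zero-filled ImmutableTensor is broadcast to the padding shape
// (vars[1]) and concatenated after the input (vars[0]).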
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto padding_shp_var = vars[1];
        TensorShape shape;
        shape.ndim = i->shape().ndim;
        for (size_t ax = 0; ax < shape.ndim; ++ax)
            shape[ax] = 1;
        shape[axis] = const_extent;
        auto host_val =
                std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
        host_val->resize(shape);
        auto ptr = host_val->raw_ptr();
        size_t size_bytes = TensorLayout{shape, i->dtype()}.span().dist_byte();
        std::memset(ptr, 0, size_bytes);
        auto padding = opr::ImmutableTensor::make(*i->owner_graph(), *host_val);
        padding = opr::Broadcast::make(padding, padding_shp_var);
        auto o = opr::Concat::make({i, padding}, axis);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 2);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}

/* ============== SubtensorEmitter ================= */
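// Emits a builder that crops the input back along `axis`, keeping only the
// first `const_extent` elements via opr::Subtensor and leaving every other
// axis untouched.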
SubtensorEmitter::EmitResult SubtensorEmitter::emit() const {
    auto&& const_extent = m_const_extent;
    auto&& axis = m_axis;
    auto builder = [const_extent, axis](const VarNodeArray& vars) {
        auto i = vars[0];
        auto x = SymbolVar(i);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        using AIdx = opr::Subtensor::AxisIndexer;
        std::vector<AIdx> index(i->shape().ndim);
        for (size_t ax = 0; ax < index.size(); ++ax) {
            if (ax == axis)
                index[ax] =
                        AIdx::make_interval(ax, None, cv(const_extent), None);
            else
                index[ax] = AIdx::make_interval(ax, None, None, cv(1));
        }
        auto o = opr::Subtensor::make(x, index);
        return o.node();
    };
    auto checker = [axis](const VarNodeArray& vars) {
        mgb_assert(vars.size() == 1);
        return vars[0]->shape().ndim > axis;
    };
    return std::make_tuple(builder, checker);
}
// vim: syntax=cpp.doxygen

@@ -12,12 +12,27 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"

using namespace mgb;
using namespace gopt;
using NamedTensorShape = megdnn::NamedTensorShape;
using Dimension = megdnn::Dimension;

namespace {
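// Euclid's algorithm; callers combine two alignments into their least common
// multiple via a * b / gcd(a, b).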
int gcd(const int& p, const int& q) {
    int x = p, y = q;
    while (y != 0) {
        if (x < y) {
            y = (y % x);
        } else {
            x = (x % y);
            std::swap(x, y);
        }
    }
    return x;
}
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
    switch (format) {
        case TensorFormats::NCHW:

@@ -371,6 +386,170 @@ ReformatManager::ReformatImpl ReformatManager::get(
    })
}
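
// Feature reformat with automatic alignment: pad the channel dimension up to
// lcm(input_alignment, output_alignment), run the reformat given by `key`,
// then crop the result back to the output format's own alignment.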
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
        const VarNode* orig_var, TensorFormats orig_format,
        const ReformatKey& key) const {
    NamedTensorShape input_shape =
            tensor_formats_to_named_tensor_shape(key.input_format);
    NamedTensorShape output_shape =
            tensor_formats_to_named_tensor_shape(key.output_format);
    size_t input_alignment = 0, output_alignment = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            input_channel_idx = i;
            input_alignment = input_shape[i].stride();
            break;
        }
    }
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            output_channel_idx = i;
            output_alignment = output_shape[i].stride();
            break;
        }
    }
    NamedTensorShape orig_shape =
            tensor_formats_to_named_tensor_shape(orig_format);
    size_t orig_channel = 0;
    for (size_t i = 0; i < orig_shape.ndim; ++i) {
        if (orig_shape[i].name() == Dimension::Name::C &&
            orig_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            orig_channel = orig_var->shape()[i] * orig_shape[i].stride();
            break;
        }
    }
    mgb_assert(orig_channel > 0,
               "incompatible NamedTensorShape for feature(got:%s)",
               orig_shape.to_string().c_str());
    size_t aligned_in_channel =
            divup(orig_channel, input_alignment) * input_alignment;
    size_t aligned_out_channel =
            divup(orig_channel, output_alignment) * output_alignment;
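    // the padded channel count must satisfy both formats, so round up to the
    // least common multiple of the two alignments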
    size_t common_alignment = input_alignment * output_alignment /
                              gcd(input_alignment, output_alignment);
    size_t aligned_channel =
            divup(orig_channel, common_alignment) * common_alignment;
    auto builder = [key, aligned_channel, aligned_in_channel,
                    aligned_out_channel, input_shape, input_channel_idx,
                    output_shape,
                    output_channel_idx](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
        if (aligned_channel > aligned_in_channel) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_channel - aligned_in_channel) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
        if (aligned_channel > aligned_out_channel) {
            auto&& dim = output_shape[output_channel_idx];
            size_t const_extent = aligned_out_channel / dim.stride();
            auto sub = std::get<0>(
                    SubtensorEmitter{const_extent, output_channel_idx}.emit());
            cur = sub({cur});
        }
        return cur;
    };
    return builder;
}
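
// Weight reformat with automatic alignment: zero-pad the input-channel and
// output-channel axes up to the alignments required by the output format
// (combined with any extra alignment requested by the caller) before running
// the reformat given by `key`.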
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
        const VarNode* orig_var, const ReformatKey& key,
        const AlignmentDesc& extra_alignment) const {
    size_t in_channels = 0, out_channels = 0;
    size_t input_channel_idx = 0, output_channel_idx = 0;
    Dimension::Name out_channel_name = Dimension::Name::N;
    auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
    for (size_t i = 0; i < input_shape.ndim; ++i) {
        if (input_shape[i].name() == Dimension::Name::C &&
            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channels = orig_var->shape()[i];
            input_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        } else if ((input_shape[i].name() == Dimension::Name::K ||
                    input_shape[i].name() == Dimension::Name::N) &&
                   input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channels = orig_var->shape()[i];
            out_channel_name = input_shape[i].name();
            output_channel_idx = i;
            mgb_assert(input_shape[i].stride() == 1,
                       "unsupported weight format(got:%s)",
                       input_shape.to_string().c_str());
        }
    }
    size_t in_channel_alignment = 1, out_channel_alignment = 1;
    auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
    for (size_t i = 0; i < output_shape.ndim; ++i) {
        if (output_shape[i].name() == Dimension::Name::C &&
            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            in_channel_alignment = output_shape[i].stride();
        } else if (output_shape[i].name() == out_channel_name &&
                   output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
            out_channel_alignment = output_shape[i].stride();
        }
    }
    size_t aligned_in_channel =
            divup(in_channels, in_channel_alignment) * in_channel_alignment;
    if (extra_alignment.name == out_channel_name) {
        out_channel_alignment =
                extra_alignment.alignment * out_channel_alignment /
                gcd(extra_alignment.alignment, out_channel_alignment);
    }
    size_t aligned_out_channel =
            divup(out_channels, out_channel_alignment) * out_channel_alignment;
    auto builder = [key, input_shape, in_channels, input_channel_idx,
                    aligned_in_channel, out_channels, output_channel_idx,
                    aligned_out_channel](const VarNodeArray& vars) {
        VarNode *x, *cur;
        x = cur = vars[0];
        if (aligned_in_channel > in_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[input_channel_idx];
            size_t const_extent =
                    (aligned_in_channel - in_channels) / dim.stride();
            padding_shape[input_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({x});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, input_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        if (aligned_out_channel > out_channels) {
            auto padding_shape = input_shape;
            auto&& dim = padding_shape[output_channel_idx];
            size_t const_extent =
                    (aligned_out_channel - out_channels) / dim.stride();
            padding_shape[output_channel_idx] =
                    Dimension(dim.name(), dim.stride(), const_extent);
            auto make_shape = std::get<0>(
                    MakeShapeEmitter{input_shape, padding_shape}.emit());
            auto padding_shp_var = make_shape({cur});
            auto padding = std::get<0>(
                    PaddingEmitter{const_extent, output_channel_idx}.emit());
            cur = padding({cur, padding_shp_var});
        }
        cur = ReformatManager::instance().get(key)({cur});
        return cur;
    };
    return builder;
}
const ReformatManager& ReformatManager::instance() {
    static ReformatManager inst;
    return inst;

@@ -77,6 +77,26 @@ private:
    };
    UnderlyingBuilders analyze() const;
};
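
//! emits a builder that zero-pads a tensor along a given axis; the padding
//! extent is fixed at emit time while the remaining shape comes from a shape
//! var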
class PaddingEmitter final : public Emitter {
public:
    PaddingEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};
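
//! emits a builder that crops a tensor back to the first const_extent
//! elements along a given axis via opr::Subtensor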
class SubtensorEmitter final : public Emitter {
public:
    SubtensorEmitter(size_t const_extent, size_t axis)
            : m_const_extent{const_extent}, m_axis{axis} {}
    EmitResult emit() const override;

private:
    size_t m_const_extent, m_axis;
};
} // namespace gopt
} // namespace mgb

@@ -101,12 +101,21 @@ public:
                              ReformatKey::Equal>;
    ReformatImpl get(const ReformatKey& key) const;
    ReformatImpl get(ReformatKey&& key) const { return get(key); }
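
    //! builds a reformat for a feature var whose channel count may violate
    //! the alignment required by the formats in key; zero padding and
    //! cropping are inserted around the underlying reformat as needed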
    ReformatImpl auto_aligned_reformat_featrue(const VarNode* orig_var,
                                               TensorFormats orig_format,
                                               const ReformatKey& key) const;

    struct AlignmentDesc {
        megdnn::Dimension::Name name;
        size_t alignment;
    };
    ReformatImpl auto_aligned_reformat_weight(
            const VarNode* orig_var, const ReformatKey& key,
            const AlignmentDesc& extra_alignment = {}) const;

    static const ReformatManager& instance();

private:
    ReformatCache m_cache;
};
} // namespace gopt
} // namespace mgb

@@ -13,7 +13,10 @@
#include "./helper.h"
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/plugin/base.h"
#include "megbrain/plugin/profiler.h"

using namespace mgb;
using namespace gopt;

@@ -168,4 +171,287 @@ TEST(TestReformatManager, InputChannelSmall) {
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
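
// C = 22 is not a multiple of 32, so the NCHWc4 -> NCHWc32 reformat has to
// pad the channels before converting and crop them again afterwards; the
// batch size is changed between runs to exercise shape re-inference.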
TEST(TestReformatManager, AutoAlignedFeature) {
    constexpr size_t N = 16, C = 22, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc32;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val) {
        return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
    };
    auto orig_x = mkvar("orig_x", host_orig_x);
    auto x = mkvar("x", host_x);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
    *host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
    func->execute();
    *host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
    func->execute();
    auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
    ASSERT_TRUE(shp.eq_shape(t.shape()));
}
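
// same flow with 4-bit asymmetric-quantized tensors (C = 94, NCHWc4 ->
// NCHWc64); this only checks that the padded reformat graph builds and runs.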
TEST(TestReformatManager, AutoAlignedFeatureB4) {
    constexpr size_t N = 16, C = 94, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
                dtype);
    };
    auto orig_x = mkvar("orig_x", host_orig_x,
                        dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
    auto x = mkvar("x", host_x,
                   dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
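
// weight case: a {K, C, R, S} filter gets both its input channels (C) and
// output channels (K/N) padded, with an extra alignment of 64 requested for
// the output channels.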
TEST(TestReformatManager, AutoAlignedWeight) {
    constexpr size_t K = 32, C = 32, R = 3, S = 3;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };
    auto w = mkvar("w", {K, C, R, S});
    auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
            w.node(), key,
            ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
    auto y = builder({w.node()});
    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
#if MGB_CUDA
#include "megbrain/comp_node_env.h"
namespace {
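// plugin that records comp-node events around the first and last operator of
// the reformat chain, so the device-side duration of the whole chain can be
// measured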
class ReformatProfiler : public PluginBase {
    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;

public:
    class MarkInputContiguous;
    ReformatProfiler(cg::ComputingGraph* graph, cg::OperatorNodeBase* opr_start,
                     cg::OperatorNodeBase* opr_end);
    ~ReformatProfiler() noexcept;
    double duration() const;

private:
    CompNodeEventPtr m_start, m_end;
    cg::OperatorNodeBase *m_opr_start, *m_opr_end;
};

ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
                                   cg::OperatorNodeBase* opr_start,
                                   cg::OperatorNodeBase* opr_end)
        : PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
    using namespace cg::event;
    auto on_reformat_start = [this](BeforeKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_start)
            return;
        if (m_start == nullptr) {
            m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_start->record();
    };
    auto on_reformat_end = [this](AfterKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_end)
            return;
        if (m_end == nullptr) {
            m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_end->record();
    };
    auto&& ev = graph->event();
    add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
    add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
}

ReformatProfiler::~ReformatProfiler() noexcept {
    if (m_start)
        m_start->host_wait();
    if (m_end)
        m_end->host_wait();
}

double ReformatProfiler::duration() const {
    mgb_assert(m_end);
    m_end->host_wait();
    return m_start->elapsed_time_until(*m_end) -
           m_start->elapsed_time_until(*m_start);
}

MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
                     cg::SingleCNOperatorNodeBase) // {
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override;

public:
    MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar node, const OperatorNodeConfig& config = {});
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);

ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
        VarNode* node, const OperatorNodeConfig& config)
        : Super(node->owner_graph(), config, "mark_contiguous", {node}) {
    add_input({node});
    add_output(None);
}

SymbolVar ReformatProfiler::MarkInputContiguous::make(
        SymbolVar node, const OperatorNodeConfig& config) {
    return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
                                                              config);
}

void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}

void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
    input(0)->add_layout_constraint_contiguous();
}

class CUTimer {
public:
    CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
            : m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
        reset();
    }
    void reset() {
        m_started = false;
        m_stopped = false;
    }
    void start() {
        mgb_assert(!m_started);
        mgb_assert(!m_stopped);
        m_started = true;
        cudaEventRecord(m_evt0, m_stream);
    }
    void stop() {
        mgb_assert(m_started);
        mgb_assert(!m_stopped);
        m_stopped = true;
        cudaEventRecord(m_evt1, m_stream);
    }
    size_t get_time_in_us() const {
        cudaStreamSynchronize(m_stream);
        float t = -1;
        cudaEventElapsedTime(&t, m_evt0, m_evt1);
        return static_cast<size_t>(t * 1e3);
    }

private:
    bool m_started, m_stopped;
    cudaStream_t& m_stream;
    cudaEvent_t &m_evt0, &m_evt1;
};
} // namespace
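
// times the auto-aligned reformat chain twice: with raw CUDA events around
// the execution loop, and with the event-based ReformatProfiler; the two
// measurements are expected to match.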
TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpux");
    using ReformatKey = ReformatManager::ReformatKey;
    auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
    HostTensorND hval(cn, dtype);
    constexpr size_t N = 16, C = 18, H = 55, W = 55;
    hval.resize({N, (C + 63) / 64, H, W, 64});
    std::shared_ptr<DeviceTensorND> dval =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dval->copy_from(hval).sync();
    std::shared_ptr<DeviceTensorND> dprime =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dprime->resize({N, C, H, W});
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
    ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            xprime.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});
    auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
    auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
    auto output_spec = std::make_pair(mark, cb);
    auto func = graph->compile({output_spec});
    static constexpr size_t RUNS = 100;
    cn.activate();
    auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
    cudaEvent_t evt0;
    cudaEvent_t evt1;
    MGB_CUDA_CHECK(cudaEventCreate(&evt0));
    MGB_CUDA_CHECK(cudaEventCreate(&evt1));
    CUTimer timer(stream, evt0, evt1);
    timer.start();
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    timer.stop();
    double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);
    OperatorNodeBase* start = x.node()->owner_opr();
    OperatorNodeBase* end = y->owner_opr();
    std::unique_ptr<ReformatProfiler> profiler =
            std::make_unique<ReformatProfiler>(graph.get(), start, end);
    ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    double time_profiler = profiler->duration() * 1e6;
    printf("%f, %f\n", time_profiler, time_cuda_evt);
    ASSERT_EQ(time_cuda_evt, time_profiler);
    MGB_CUDA_CHECK(cudaEventDestroy(evt0));
    MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}