/**
 * \file src/gopt/test/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"

using namespace mgb;
using namespace gopt;
using namespace serialization;

namespace {
//! Test-local context bundling the operator whitelist, the candidate tensor
//! formats and the per-operator format dispatchers needed to build a
//! layout-transform Problem for the profiler tests below.
class LayoutTransformContext : public NonCopyableObj {
public:
    using OprList = SubGraphExtractor::OprList;
    using OprFormat = Problem::OprFormat;
    using OprConfigTrait = Problem::OprConfigTrait;

    LayoutTransformContext() = delete;
    LayoutTransformContext(OprList opr_list,
                           SmallVector<TensorFormats> available_tensor_formats,
                           OprConfigTrait opr_configs)
            : m_opr_list{std::move(opr_list)},
              m_available_tensor_formats{std::move(available_tensor_formats)},
              m_opr_configs{std::move(opr_configs)} {}

    const OprList& opr_list() const { return m_opr_list; }
    const SmallVector<TensorFormats>& available_tensor_formats() const {
        return m_available_tensor_formats;
    }
    const OprConfigTrait& opr_configs() const { return m_opr_configs; }

    //! Build the default context used by all tests: the supported operator
    //! types, the format dispatchers registered for each of them, and the
    //! tensor formats the profiler may choose from.
    static std::unique_ptr<LayoutTransformContext> make() {
        OprList opr_list = {
                opr::ConvBiasForward::typeinfo(),
                opr::ConvolutionForward::typeinfo(),
                opr::ConvolutionBackwardData::typeinfo(),
                opr::ElemwiseMultiType::typeinfo(),
                opr::Elemwise::typeinfo(),
                opr::TypeCvt::typeinfo(),
                opr::PoolingForward::typeinfo(),
                opr::WarpPerspectiveForward::typeinfo(),
        };

        OprConfigTrait opr_configs;
        {  // conv bias
            auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt)                                                           \
    dispatchers[OprFormat::_fmt] =                                         \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
                    opr::ConvBias::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
            cb(NCHW32);
            cb(NHWC);
            cb(NCHW64);
            cb(CHWN4);
#undef cb
        }

        {  // deconv
            auto& dispatchers =
                    opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt)                                                           \
    dispatchers[OprFormat::_fmt] =                                         \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
                    opr::ConvolutionBackwardData::typeinfo(),              \
                    OprFormat::_fmt);
            cb(NCHW4);
#undef cb
        }

        {  // convolution
            auto& dispatchers =
                    opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt)                                                           \
    dispatchers[OprFormat::_fmt] =                                         \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
                    opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
#undef cb
        }

        {  // pooling
            auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt)                                                           \
    dispatchers[OprFormat::_fmt] =                                         \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
                    opr::PoolingForward::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
            cb(NCHW32);
            cb(NHWC);
            cb(NCHW64);
            cb(CHWN4);
#undef cb
        }

        {  // warp perspective
            auto& dispatchers =
                    opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt)                                                           \
    dispatchers[OprFormat::_fmt] =                                         \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
                    opr::WarpPerspectiveForward::typeinfo(),               \
                    OprFormat::_fmt);
            cb(NHWC);
            cb(NCHW4);
            cb(NCHW64);
#undef cb
        }

        SmallVector<TensorFormats> available_tensor_formats = {
                TensorFormats::NHWC, TensorFormats::NCHWc4,
                TensorFormats::NCHWc32, TensorFormats::NCHWc64};

        return std::make_unique<LayoutTransformContext>(
                std::move(opr_list), std::move(available_tensor_formats),
                std::move(opr_configs));
    }

private:
    OprList m_opr_list;
    SmallVector<TensorFormats> m_available_tensor_formats;
    OprConfigTrait m_opr_configs;
};
}  // namespace

#if MGB_CUDA
#if CUDA_VERSION >= 10020
TEST(TestProfiler, Conv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

    auto ctx = LayoutTransformContext::make();

    // NOTE(review): template arguments of HostTensorGenerator were lost in
    // extraction; Int8 matches the quantized TypeCvt usage below -- confirm
    // against upstream history.
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 48, 14, 14},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto c1 = opr::ConvBias::make(
            x, w1, b1, param, {},
            OperatorNodeConfig(dtype::Quantized4Asymm(
                    12.345f, static_cast<uint8_t>(5))));
    x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
    auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
    auto c2 = opr::ConvBias::make(
            x, w2, b2, param, {},
            OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    // both conv-bias oprs and the final typecvt must be profiled ...
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    // ... while constant weights/bias never need layout-transform records
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(b1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
    EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif

TEST(TestProfiler, Deconv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
    auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    using Param = opr::ConvolutionBackwardData::Param;
    Param param;
    param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 0;
    auto c1 = opr::ConvolutionBackwardData::make(
            w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    auto c2 = opr::ConvolutionBackwardData::make(
            w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    // constant filters are excluded from the var records
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}

TEST(TestProfiler, Warp) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

    auto ctx = LayoutTransformContext::make();

    constexpr size_t INP_H = 10, INP_W = 10, N = 16;

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto x = mkvar("x", {N, 48, INP_H, INP_W},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    // build N fixed 3x3 perspective matrices (rotation+scale+sheer+offset)
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };

    auto mat_host = std::make_shared<HostTensorND>(
            x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({w1}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({w1});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
    // the perspective matrix and the shape input are not transformable vars,
    // only the image input is
    EXPECT_TRUE(var_rst.count(mat.node()) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}

TEST(TestProfiler, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 64, 55, 55},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    using Param = opr::Pooling::Param;
    Param param;
    param.format = Param::Format::NCHW;
    auto p1 = opr::Pooling::make(x, param);
    x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
    auto p2 = opr::Pooling::make(x, param);

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({p2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({p2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
}

TEST(TestProfiler, Elemwise) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);

    auto ctx = LayoutTransformContext::make();

    // default (Float32) generator: the float branch below needs real floats
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
    auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
    auto c = opr::Elemwise::make({a, b},
                                 {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
    auto q4c = opr::TypeCvt::make(
            c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
    auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
    auto q8d = opr::ElemwiseMultiType::make(
            {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
            OperatorNodeConfig(dtype::QuantizedS8(12.f)));
    auto q4d = opr::TypeCvt::make(
            q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
    auto q4e = opr::ElemwiseMultiType::make(
            {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig(
                    dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({q4e});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
    // elemwise inputs are graph vars, so they do appear in the var records
    EXPECT_TRUE(var_rst.count(a.node()) > 0);
    EXPECT_TRUE(var_rst.count(b.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}