|
- /**
- * \file src/gopt/test/profiler.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
- * implied.
- */
-
- #include "./helper.h"
- #include "megbrain/gopt/global_layout_transform.h"
- #include "megbrain/gopt/inference.h"
- #include "megbrain/opr/dnn/pooling.h"
- #include "megbrain/opr/imgproc.h"
- #include "megbrain/opr/nn_int.h"
- #include "megbrain/serialization/serializer.h"
-
- using namespace mgb;
- using namespace gopt;
- using namespace serialization;
-
- namespace {
- class LayoutTransformContext : public NonCopyableObj {
- public:
- using OprList = SubGraphExtractor::OprList;
- using OprFormat = Problem::OprFormat;
- using OprConfigTrait = Problem::OprConfigTrait;
-
- LayoutTransformContext() = delete;
- LayoutTransformContext(OprList opr_list,
- SmallVector<TensorFormats> available_tensor_formats,
- OprConfigTrait opr_configs)
- : m_opr_list{std::move(opr_list)},
- m_available_tensor_formats{std::move(available_tensor_formats)},
- m_opr_configs{std::move(opr_configs)} {}
- const OprList& opr_list() const { return m_opr_list; }
- const SmallVector<TensorFormats>& available_tensor_formats() const {
- return m_available_tensor_formats;
- }
- const OprConfigTrait& opr_configs() const { return m_opr_configs; }
- static std::unique_ptr<LayoutTransformContext> make() {
- OprList opr_list = {
- opr::ConvBiasForward::typeinfo(),
- opr::ConvolutionForward::typeinfo(),
- opr::ConvolutionBackwardData::typeinfo(),
- opr::ElemwiseMultiType::typeinfo(),
- opr::Elemwise::typeinfo(),
- opr::TypeCvt::typeinfo(),
- opr::PoolingForward::typeinfo(),
- opr::WarpPerspectiveForward::typeinfo(),
- };
- OprConfigTrait opr_configs;
- {
- auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
- #define cb(_fmt) \
- dispatchers[OprFormat::_fmt] = \
- OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
- opr::ConvBias::typeinfo(), OprFormat::_fmt);
- cb(NCHW4);
- cb(NCHW32);
- cb(NHWC);
- cb(NCHW64);
- cb(CHWN4);
- #undef cb
- }
- {
- auto& dispatchers =
- opr_configs[opr::ConvolutionBackwardData::typeinfo()];
- #define cb(_fmt) \
- dispatchers[OprFormat::_fmt] = \
- OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
- opr::ConvolutionBackwardData::typeinfo(), \
- OprFormat::_fmt);
- cb(NCHW4);
- #undef cb
- }
-
- {
- auto& dispatchers =
- opr_configs[opr::ConvolutionForward::typeinfo()];
- #define cb(_fmt) \
- dispatchers[OprFormat::_fmt] = \
- OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
- opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
- cb(NCHW4);
- #undef cb
- }
-
- {
- auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
- #define cb(_fmt) \
- dispatchers[OprFormat::_fmt] = \
- OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
- opr::PoolingForward::typeinfo(), OprFormat::_fmt);
- cb(NCHW4);
- cb(NCHW32);
- cb(NHWC);
- cb(NCHW64);
- cb(CHWN4);
- #undef cb
- }
-
- {
- auto& dispatchers =
- opr_configs[opr::WarpPerspectiveForward::typeinfo()];
- #define cb(_fmt) \
- dispatchers[OprFormat::_fmt] = \
- OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
- opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
- cb(NHWC);
- cb(NCHW4);
- cb(NCHW64);
- #undef cb
- }
-
- SmallVector<TensorFormats> available_tensor_formats = {
- TensorFormats::NHWC, TensorFormats::NCHWc4,
- TensorFormats::NCHWc32, TensorFormats::NCHWc64};
- return std::make_unique<LayoutTransformContext>(
- std::move(opr_list), std::move(available_tensor_formats),
- std::move(opr_configs));
- }
-
- private:
- OprList m_opr_list;
- SmallVector<TensorFormats> m_available_tensor_formats;
- OprConfigTrait m_opr_configs;
- };
- }; // namespace
-
- #if MGB_CUDA
- #if CUDA_VERSION >= 10020
- TEST(TestProfiler, Conv) {
- REQUIRE_GPU(1);
- auto cn = CompNode::load("gpu0");
- cn.activate();
- REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
- auto ctx = LayoutTransformContext::make();
-
- HostTensorGenerator<dtype::Int8> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
- dtype);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
- .rename(name),
- dtype);
- };
- auto x = mkvar("x", {64, 48, 14, 14},
- dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
- auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
- auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
- opr::ConvBias::Param param;
- param.format = opr::ConvBias::Param::Format::NCHW;
- param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
- param.stride_h = param.stride_w = 1;
- param.pad_h = param.pad_w = 1;
- auto c1 = opr::ConvBias::make(x, w1, b1, param, {},
- OperatorNodeConfig(dtype::Quantized4Asymm(
- 12.345f, static_cast<uint8_t>(5))));
- x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
- auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
- auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
- auto c2 = opr::ConvBias::make(x, w2, b2, param, {},
- OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
-
- using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
- S strategy = S::PROFILE;
- gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
- using OprFormat = OprTensorFormatsConfiguration::OprFormat;
- SubGraphExtractor extractor(ctx->opr_list());
- auto partitions = extractor.extract({c2});
- ASSERT_EQ(partitions.size(), 1u);
- using Attribute = Problem::Attribute;
- Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
- Problem problem(partitions[0], ctx->available_tensor_formats(),
- ctx->opr_configs(), attribute);
- auto profiler = ProfilerBase::make_profiler();
- auto rst = profiler->profile(problem);
- const auto& opr_rst = rst.opr_record;
- const auto& var_rst = rst.var_record;
- EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
- EXPECT_TRUE(var_rst.count(w1.node()) == 0);
- EXPECT_TRUE(var_rst.count(b1.node()) == 0);
- EXPECT_TRUE(var_rst.count(w2.node()) == 0);
- EXPECT_TRUE(var_rst.count(b2.node()) == 0);
- }
- #endif
-
- TEST(TestProfiler, Deconv) {
- REQUIRE_GPU(1);
- auto cn = CompNode::load("gpu0");
- cn.activate();
- REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
- auto ctx = LayoutTransformContext::make();
-
- HostTensorGenerator<dtype::Int8> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
- dtype);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
- .rename(name),
- dtype);
- };
- auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
- auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
- using Param = opr::ConvolutionBackwardData::Param;
- Param param;
- param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
- param.stride_h = param.stride_w = 2;
- param.pad_h = param.pad_w = 0;
- auto c1 = opr::ConvolutionBackwardData::make(
- w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
- auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
- auto c2 = opr::ConvolutionBackwardData::make(
- w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
-
- using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
- S strategy = S::PROFILE;
- gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
- using OprFormat = OprTensorFormatsConfiguration::OprFormat;
- SubGraphExtractor extractor(ctx->opr_list());
- auto partitions = extractor.extract({c2});
- ASSERT_EQ(partitions.size(), 1u);
- using Attribute = Problem::Attribute;
- Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
- Problem problem(partitions[0], ctx->available_tensor_formats(),
- ctx->opr_configs(), attribute);
- auto profiler = ProfilerBase::make_profiler();
- auto rst = profiler->profile(problem);
- const auto& opr_rst = rst.opr_record;
- const auto& var_rst = rst.var_record;
- EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
- EXPECT_TRUE(var_rst.count(w1.node()) == 0);
- EXPECT_TRUE(var_rst.count(w2.node()) == 0);
- }
-
- TEST(TestProfiler, Warp) {
- REQUIRE_GPU(1);
- auto cn = CompNode::load("gpu0");
- cn.activate();
- REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
- auto ctx = LayoutTransformContext::make();
-
- constexpr size_t INP_H = 10, INP_W = 10, N = 16;
-
- HostTensorGenerator<dtype::Int8> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
- dtype);
- };
-
- auto x = mkvar("x", {N, 48, INP_H, INP_W},
- dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
- float value1 = M_PI, value2 = 0.6;
- auto gen_mat = [&](HostTensorND& mat) {
- auto ptr = mat.ptr<float>();
- for (size_t i = 0; i < N; ++i) {
- auto rot = value1, scale = value2, sheer = value1, dy = value2,
- dx = value2, ky = value2, kx = value2, kb = value2;
- ptr[0] = ptr[4] = cos(rot) * scale;
- ptr[1] = -(ptr[3] = sin(rot) * scale);
- ptr[3] *= sheer;
- ptr[4] *= sheer;
- ptr[2] = dx;
- ptr[5] = dy;
- ptr[6] = kx;
- ptr[7] = ky;
- ptr[8] = kb;
- ptr += 9;
- }
- mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
- };
- auto mat_host = std::make_shared<HostTensorND>(
- x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
- gen_mat(*mat_host);
- auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
- TensorShape out_shp{20, 20};
- auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);
-
- using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
- S strategy = S::PROFILE;
- gopt::modify_opr_algo_strategy_inplace({w1}, strategy);
- using OprFormat = OprTensorFormatsConfiguration::OprFormat;
- SubGraphExtractor extractor(ctx->opr_list());
- auto partitions = extractor.extract({w1});
- ASSERT_EQ(partitions.size(), 1u);
- using Attribute = Problem::Attribute;
- Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
- Problem problem(partitions[0], ctx->available_tensor_formats(),
- ctx->opr_configs(), attribute);
- auto profiler = ProfilerBase::make_profiler();
- auto rst = profiler->profile(problem);
- const auto& opr_rst = rst.opr_record;
- const auto& var_rst = rst.var_record;
- EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
- EXPECT_TRUE(var_rst.count(mat.node()) == 0);
- EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
- EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
- }
-
- TEST(TestProfiler, Pooling) {
- REQUIRE_GPU(1);
- auto cn = CompNode::load("gpu0");
- cn.activate();
- REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
- auto ctx = LayoutTransformContext::make();
-
- HostTensorGenerator<dtype::Int8> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
- dtype);
- };
- auto x = mkvar("x", {64, 64, 55, 55},
- dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
- using Param = opr::Pooling::Param;
- Param param;
- param.format = Param::Format::NCHW;
- auto p1 = opr::Pooling::make(x, param);
- x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
- auto p2 = opr::Pooling::make(x, param);
-
- using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
- S strategy = S::PROFILE;
- gopt::modify_opr_algo_strategy_inplace({p2}, strategy);
- using OprFormat = OprTensorFormatsConfiguration::OprFormat;
- SubGraphExtractor extractor(ctx->opr_list());
- auto partitions = extractor.extract({p2});
- ASSERT_EQ(partitions.size(), 1u);
- using Attribute = Problem::Attribute;
- Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
- Problem problem(partitions[0], ctx->available_tensor_formats(),
- ctx->opr_configs(), attribute);
- auto profiler = ProfilerBase::make_profiler();
- auto rst = profiler->profile(problem);
- const auto& opr_rst = rst.opr_record;
- EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
- }
-
- TEST(TestProfiler, Elemwise) {
- REQUIRE_GPU(1);
- auto cn = CompNode::load("gpu0");
- cn.activate();
- REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
- auto ctx = LayoutTransformContext::make();
-
- HostTensorGenerator<dtype::Int8> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp,
- const DType& dtype) {
- return opr::TypeCvt::make(
- opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
- dtype);
- };
- auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
- auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
- auto c = opr::Elemwise::make({a, b},
- {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
- auto q4c = opr::TypeCvt::make(
- c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
- auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
- auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
- auto q8d = opr::ElemwiseMultiType::make(
- {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
- OperatorNodeConfig(dtype::QuantizedS8(12.f)));
- auto q4d = opr::TypeCvt::make(
- q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
- auto q4e = opr::ElemwiseMultiType::make(
- {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
- OperatorNodeConfig(
- dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));
-
- using OprFormat = OprTensorFormatsConfiguration::OprFormat;
- SubGraphExtractor extractor(ctx->opr_list());
- auto partitions = extractor.extract({q4e});
- ASSERT_EQ(partitions.size(), 1u);
- using Attribute = Problem::Attribute;
- Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
- Problem problem(partitions[0], ctx->available_tensor_formats(),
- ctx->opr_configs(), attribute);
- auto profiler = ProfilerBase::make_profiler();
- auto rst = profiler->profile(problem);
- const auto& opr_rst = rst.opr_record;
- const auto& var_rst = rst.var_record;
- EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
- EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
- EXPECT_TRUE(var_rst.count(a.node()) > 0);
- EXPECT_TRUE(var_rst.count(b.node()) > 0);
- EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
- EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
- }
-
- #endif
-
- // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
|