feat(mgb/gopt): add profiler cache

Add a profiler cache to speed up the profiling procedure and to make the layout transform testcases stable. Profiling results obtained in the CI environment are cached in files. GitOrigin-RevId: ba2743f35f
4 years ago · 012de7695d
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,6 +9,7 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
 dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
 dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
 dnn/src/cuda/elemwise_multi_type/kimpl/* binary
 src/gopt/test/cache_data.h binary
 tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
 imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
 ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
--- a/sdk/load-and-run/BUILD
+++ b/sdk/load-and-run/BUILD
@@ -2,13 +2,11 @@ cc_library(
    name = "mgblar",
    copts = ["-std=c++14"],
    srcs = [
        "src/infile_persistent_cache.cpp",
        "src/mgblar.cpp",
        "src/json_loader.cpp",
        "src/text_table.cpp",
    ],
    hdrs = [
        "src/infile_persistent_cache.h",
        "src/mgblar.h",
        "src/json_loader.h",
        "src/text_table.h",
@@ -57,11 +55,9 @@ cc_megvii_binary(
 cc_library(
    name = "megbrain_ios_lar_lib",
    srcs = [
        "src/infile_persistent_cache.cpp",
        "src/mgblar.cpp",
    ],
    hdrs = [
        "src/infile_persistent_cache.h",
        "src/mgblar.h",
    ],
    copts = ["-DMGB_NO_MAIN=1"],
--- a/sdk/load-and-run/src/mgblar.cpp
+++ b/sdk/load-and-run/src/mgblar.cpp
@@ -10,7 +10,6 @@
 */

 #include "./mgblar.h"
 #include "./infile_persistent_cache.h"
 #include "./json_loader.h"
 #include "./npy.h"
 #include "./text_table.h"
@@ -30,6 +29,7 @@
 #include "megbrain/serialization/extern_c_opr.h"
 #include "megbrain/serialization/serializer.h"
 #include "megbrain/utils/debug.h"
 #include "megbrain/utils/infile_persistent_cache.h"

 #include "megbrain/system.h"
 #include "megbrain/version.h"
--- a/sdk/load-and-run/src/infile_persistent_cache.cpp
+++ b/sdk/load-and-run/src/infile_persistent_cache.cpp
@@ -1,5 +1,5 @@
 /**
 * \file sdk/load-and-run/src/infile_persistent_cache.cpp
 * \file src/core/impl/utils/infile_persistent_cache.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -9,7 +9,7 @@
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "./infile_persistent_cache.h"
 #include "megbrain/utils/infile_persistent_cache.h"

 #if defined(_WIN32)
 #include <io.h>
--- a/src/core/include/megbrain/utils/infile_persistent_cache.h
+++ b/src/core/include/megbrain/utils/infile_persistent_cache.h
@@ -1,5 +1,5 @@
 /**
 * \file sdk/load-and-run/src/infile_persistent_cache.h
 * \file src/core/include/megbrain/utils/infile_persistent_cache.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -70,6 +70,7 @@ public:
    Maybe<Blob> get(const std::string& category, const Blob& key) override;
    void put(const std::string& category, const Blob& key,
             const Blob& value) override;
    //! this implementation reports that dumping the cache is supported
    bool support_dump_cache() override { return true; }
 };
 }  // namespace mgb

--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -39,6 +39,8 @@ public:
    virtual void put(
            const std::string& category, const Blob& key, const Blob& value) = 0;

    //! whether this implementation supports dumping the cache; the base
    //! implementation returns false and subclasses may override it
    virtual bool support_dump_cache() { return false; }
    
    //! set an implementation; return the original implementation
    static std::shared_ptr<PersistentCache> set_impl(
            std::shared_ptr<PersistentCache> impl);
--- a/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
+++ b/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
@@ -0,0 +1,96 @@
 /**
 * \file src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "./opr_safe_dump.h"
 #include "megbrain/opr/basic_arith.h"
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/dnn/pooling.h"
 #include "megbrain/opr/imgproc.h"
 #include "megbrain/opr/nn_int.h"
 #include "megbrain/opr/tensor_manip.h"

 #include "midout.h"
 // Declare the midout tag used by this file. MIDOUT_B(...) starts a
 // MIDOUT_BEGIN region under that tag and opens a brace; MIDOUT_E closes that
 // brace and ends the region with MIDOUT_END(), so the two must always be used
 // as a pair around a function body.
 MIDOUT_DECL(megbrain_opr_safe_dump)
 #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) {
 #define MIDOUT_E \
    }            \
    MIDOUT_END();

 using namespace mgb;
 using namespace opr;

 namespace {
 /*!
  * \brief serialize \p param into \p data by delegating to
  * megdnn::Algorithm::serialize_write_pod
  *
  * NOTE(review): the helper name suggests Param must be a POD type -- confirm
  * against megdnn before using this with new param types.
  */
 template <typename Param>
 void write_param(std::string& data, const Param& param) {
    megdnn::Algorithm::serialize_write_pod(param, data);
 }

 //! specialization for params of type DType: nothing is written to the
 //! output string
 template <>
 void write_param(std::string& /* data */, const DType& /* dtype */) {}

 /*!
  * \brief generic dumper: serializes only the param of an operator of type
  * \p Opr
  *
  * dump() casts \p opr_ to Opr with cast_final_safe and returns the bytes
  * produced by write_param() for opr.param().
  */
 template <class Opr>
 struct OprDumpImpl {
    static std::string dump(const cg::OperatorNodeBase* opr_) {
        // MIDOUT_B opens a scope that MIDOUT_E closes; the return statement
        // lives inside that scope
        MIDOUT_B(Opr)
        auto&& opr = opr_->cast_final_safe<Opr>();
        std::string data;
        write_param(data, opr.param());
        return data;
        MIDOUT_E
    }
 };

 // INST(_Opr) specializes OprDumpImpl for operators that expose
 // execution_policy_transient(): besides the operator param, the dump also
 // contains an ExecutionPolicy built from the transient policy's strategy and
 // workspace_limit, in that order after the param bytes.
 #define INST(_Opr)                                                     \
    template <>                                                        \
    struct OprDumpImpl<_Opr> {                                         \
        static std::string dump(const cg::OperatorNodeBase* opr_) {    \
            MIDOUT_B(_Opr)                                             \
            auto&& opr = opr_->cast_final_safe<_Opr>();                \
            std::string data;                                          \
            write_param(data, opr.param());                            \
            using ExecutionPolicy = megdnn::param::ExecutionPolicy;    \
            ExecutionPolicy policy{                                    \
                    opr.execution_policy_transient().strategy,         \
                    opr.execution_policy_transient().workspace_limit}; \
            write_param(data, policy);                                 \
            return data;                                               \
            MIDOUT_E                                                   \
        }                                                              \
    };
 // operators dumped together with their execution policy
 INST(Convolution);
 INST(ConvBiasForward);
 INST(ConvolutionBackwardData);
 INST(PoolingForward);
 #undef INST
 }  // namespace

 namespace mgb {
 namespace gopt {
 namespace intl {

 /*!
  * \brief dump the parameters of \p opr into a byte string
  *
  * Dispatches on the dynamic type of \p opr over the operator types listed in
  * FOREACH_SUPPORTED_OPR and returns OprDumpImpl<Opr>::dump(opr) for the
  * matching type; throws InternalError when the type is not in the list.
  */
 std::string opr_safe_dump(const cg::OperatorNodeBase* opr) {
 // each cb(_Opr) expands to `if (...) { return ...; } else`, so the macro list
 // forms one if/else-if chain whose final `else` is the block that throws
 #define cb(_Opr)                                   \
    if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
        return OprDumpImpl<_Opr>::dump(opr);       \
    } else
    FOREACH_SUPPORTED_OPR(cb) {
        mgb_throw(InternalError, "unsupported operator(got:%s)",
                  opr->dyn_typeinfo()->name);
    }
 #undef cb
 }

 }  // namespace intl
 }  // namespace gopt
 }  // namespace mgb

 // vim: syntax=cpp.doxygen
--- a/src/gopt/impl/global_layout_transform/opr_safe_dump.h
+++ b/src/gopt/impl/global_layout_transform/opr_safe_dump.h
@@ -0,0 +1,30 @@
 /**
 * \file src/gopt/impl/global_layout_transform/opr_safe_dump.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once
 #include "megbrain/graph.h"

 namespace mgb {
 namespace gopt {
 namespace intl {
 //! apply \p cb to every operator type handled by opr_safe_dump(); the type
 //! names are unqualified, so the expansion site must make them visible
 #define FOREACH_SUPPORTED_OPR(cb)                                          \
    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)        \
            cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \
                    cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt)

 /*!
  * \brief dump the parameters of \p opr into a byte string
  * \param opr the operator node to dump; its type must be one of the types
  * listed in FOREACH_SUPPORTED_OPR
  * \return the serialized bytes
  */
 std::string opr_safe_dump(const cg::OperatorNodeBase* opr);

 }  // namespace intl
 }  // namespace gopt
 }  // namespace mgb

 // vim: syntax=cpp.doxygen
--- a/src/gopt/impl/global_layout_transform/profiler_cache.cpp
+++ b/src/gopt/impl/global_layout_transform/profiler_cache.cpp
@@ -0,0 +1,184 @@
 /**
 * \file src/gopt/impl/global_layout_transform/profiler_cache.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "./opr_safe_dump.h"
 #include "megbrain/gopt/profiler.h"
 #include "megbrain/comp_node_env.h"

 #include <cstring>

 using namespace mgb;
 using namespace gopt;
 using ReformatKey = ReformatManager::ReformatKey;

 // =================== ProfilerCache ======================
 void ProfilerCache::Key::build_blob_from_opr() {
    auto&& opr = m_key_impl.opr_key.opr;
    // process opr type
    auto type = opr->dyn_typeinfo()->name;
    size_t type_size = strlen(type);

    // process  opr param
    auto data = intl::opr_safe_dump(opr);
    size_t param_size = data.size();

    size_t nr_inputs = opr->input().size();
    size_t nr_outputs = opr->usable_output().size();
    size_t nr_layouts = nr_inputs + nr_outputs;
    m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size +
                           param_size);

    // serialize opr type
    m_blob_storage.append(type, type_size);

    // serialize param
    const char* data_ptr = reinterpret_cast<const char*>(data.data());
    m_blob_storage.append(data_ptr, param_size);

    // serialize layouts
    auto append_layout = [this](const VarNode* v) {
        TensorLayout ly{v->shape(), v->dtype(), v->format()};
        for (size_t i = 0; i < ly.ndim; ++i) {
            if (i)
                m_blob_storage.push_back(',');
            m_blob_storage.append(std::to_string(ly.shape[i]));
        }
        if (!ly.is_contiguous()) {
            m_blob_storage.push_back(';');
            for (size_t i = 0; i < ly.ndim; ++i) {
                if (i)
                    m_blob_storage.push_back(',');
                m_blob_storage.append(std::to_string(ly.stride[i]));
            }
        }
        m_blob_storage.push_back(';');
        m_blob_storage.append(ly.dtype.name());
        m_blob_storage.push_back('|');
    };
    for (size_t i = 0; i < nr_inputs; ++i) {
        append_layout(opr->input(i));
    }
    for (size_t i = 0; i < nr_outputs; ++i) {
        append_layout(opr->output(i));
    }

    // serialize opr_format
    m_blob_storage.append(std::to_string(
            static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));

    // serialize extra_attribute
    m_blob_storage.append(std::to_string(
            static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
 }

 /*!
  * \brief build the cache category string for the device of \p cn
  *
  * The category always starts with "layout_transform_profile:". CUDA devices
  * (only when MGB_CUDA is enabled) append the device name and compute
  * capability, CPU devices append "plat=cpu"; any other device type throws
  * MegBrainError after the prefix has already been stored in m_category.
  */
 void ProfilerCache::Key::build_category(CompNode cn) {
    m_category = "layout_transform_profile:";
    auto&& env = CompNodeEnv::from_comp_node(cn);
    const auto device_type = env.property().type;
 #if MGB_CUDA
    if (device_type == CompNode::DeviceType::CUDA) {
        auto&& prop = env.cuda_env().device_prop;
        m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name,
                               prop.major, prop.minor);
        return;
    }
 #endif
    if (device_type == CompNode::DeviceType::CPU) {
        m_category += "plat=cpu";
        return;
    }
    mgb_throw(MegBrainError,
              "unsupported comp node for global layout transform "
              "profiler cache category");
 }

 void ProfilerCache::Key::build_blob_from_var() {
    auto v = m_key_impl.var_key.var;

    // serialize layouts
    auto append_layout = [this](const VarNode* v) {
        TensorLayout ly{v->shape(), v->dtype(), v->format()};
        for (size_t i = 0; i < ly.ndim; ++i) {
            if (i)
                m_blob_storage.push_back(',');
            m_blob_storage.append(std::to_string(ly.shape[i]));
        }
        if (!ly.is_contiguous()) {
            m_blob_storage.push_back(';');
            for (size_t i = 0; i < ly.ndim; ++i) {
                if (i)
                    m_blob_storage.push_back(',');
                m_blob_storage.append(std::to_string(ly.stride[i]));
            }
        }
        m_blob_storage.push_back(';');
        m_blob_storage.append(ly.dtype.name());
        m_blob_storage.push_back('|');
    };
    append_layout(v);

    // serialze reformat key
    m_blob_storage.append(m_key_impl.var_key.key.to_string());
 }

 //! return the category string of this key; asserts that it is not empty
 const std::string& ProfilerCache::Key::category() const {
    mgb_assert(!m_category.empty());
    return m_category;
 }

 //! return a Blob made of the data pointer and size of m_blob_storage (the
 //! Blob refers to the storage owned by this key); asserts that the storage
 //! is not empty
 PersistentCache::Blob ProfilerCache::Key::blob() const {
    mgb_assert(!m_blob_storage.empty());
    return {m_blob_storage.data(), m_blob_storage.size()};
 }

 //! access the process-wide ProfilerCache, held in a function-local static
 ProfilerCache& ProfilerCache::inst() {
    static ProfilerCache inst;
    return inst;
 }

 /*!
  * \brief replace the underlying persistent cache implementation
  * \param impl the new implementation; must not be null
  * \return *this, to allow chaining
  */
 ProfilerCache& ProfilerCache::set_impl(std::unique_ptr<PersistentCache> impl) {
    mgb_assert(impl != nullptr);
    // after the swap the previous implementation is owned by `impl` and is
    // released when that parameter is destroyed
    m_impl.swap(impl);
    return *this;
 }

 /*!
  * \brief dump the content of the underlying cache to the file at \p path
  *
  * Asserts that the current implementation reports support_dump_cache(); the
  * implementation is then treated as an InFilePersistentCache.
  */
 void ProfilerCache::dump_cache(const char* path) {
    mgb_assert(m_impl->support_dump_cache(),
               "current impl of ProfilerCache does not support dump cache to "
               "file.");
    // the assertion above is the only guard for this unchecked downcast
    auto&& file_cache = *static_cast<InFilePersistentCache*>(m_impl.get());
    file_cache.dump_cache(path);
 }

 /*!
  * \brief look up the cached profiling cost of \p key
  * \param key the cache key; its category and blob must have been built
  * \return None when the underlying cache has no entry for \p key, otherwise
  * the cached cost
  *
  * Asserts that a found entry has a non-null pointer and is exactly
  * sizeof(float) bytes long.
  */
 Maybe<ProfilerCache::Result> ProfilerCache::get(const Key& key) {
    auto raw_buf = m_impl->get(key.category(), key.blob());
    if (!raw_buf.valid())
        return None;
    // data type of cost is float
    auto buf = static_cast<const uint8_t*>(raw_buf->ptr);
    auto size = raw_buf->size;
    mgb_assert(buf && size == sizeof(float),
               "ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
    // the blob is an untyped byte buffer without any alignment guarantee for
    // float, so copy the bytes out instead of dereferencing a casted pointer
    float cost;
    std::memcpy(&cost, buf, sizeof(cost));
    return cost;
 }

 /*!
  * \brief store \p result in the underlying cache under \p key
  *
  * The result is serialized with megdnn::Algorithm::serialize_write_pod and
  * the resulting bytes are stored as the value of the entry.
  */
 void ProfilerCache::put(const Key& key, Result& result) {
    std::string serialized;
    megdnn::Algorithm::serialize_write_pod(result, serialized);
    m_impl->put(
            key.category(), key.blob(),
            {serialized.data(), serialized.size()});
 }

 // vim: syntax=cpp.doxygen
--- a/src/gopt/impl/global_layout_transform/profiler_impl.cpp
+++ b/src/gopt/impl/global_layout_transform/profiler_impl.cpp
@@ -154,69 +154,61 @@ void MarkInputContiguous::init_output_static_infer_desc() {
 }  // namespace

 /* ================== ProfilerImpl =================*/
 /*!
  * \brief profiler implementation declared locally in this file
  *
  * Besides profile(), it declares private helpers in pairs: an overload that
  * takes all available formats/configurations and returns a record, and an
  * overload that takes one format/configuration and returns a float.
  */
 class ProfilerImpl final : public ProfilerBase {
 public:
    //! \param runs sample times of the profiler
    ProfilerImpl(int runs = 10) : m_runs{runs} {};
    ~ProfilerImpl() = default;
    ProfilingResult profile(const Problem& problem) const override;

 private:
    //! NOTE(review): the unit and exact use of this constant are not visible
    //! here -- confirm at the use sites
    static constexpr float PROFILE_TIME_OUT = 1e7;
    using ReformatAttribute = ReformatKey::Attribute;
    /*!
     * \brief profile opr format agnostic operators (like elemwise, elemwise
     * multi type, typecvt etc.)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_format the original tensor format of the operator node.
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    //! single-format counterpart of the overload above; takes one
    //! \p tensor_format and returns a float
    float profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            TensorFormats tensor_format,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile opr format aware operators (like conv, deconv, conv_bias,
     * etc.)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_config the tensor formats configuration of base opr format
     * \param available_configs all the available configurations
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr,
            const OprTensorFormatsConfiguration& base_config,
            const SmallVector<OprTensorFormatsConfiguration>& available_configs,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    //! single-configuration counterpart of the overload above; takes one
    //! \p config and returns a float
    float profile_operator(
            const OperatorNodeBase* opr,
            const OprTensorFormatsConfiguration& base_config,
            const OprTensorFormatsConfiguration& config,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile layout transform of the var node
     *
     * \param var pointer to the var node to be profiled
     * \param base_format the original tensor formats in which the var node is
     * stored
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the var node record
     */
    VarNodeRecord profile_var_node(
            const VarNode* var, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
    //! single-reformat counterpart of the overload above; takes one reformat
    //! \p key and returns a float
    float profile_var_node(
            const VarNode* var, TensorFormats base_format,
            const ReformatKey& key) const;
    int m_runs;  /// sample times of the profiler
 };
 /*!
  * \brief construct the profiler and install the default filters
  *
  * \param runs sample times of the profiler
  * \param opr_threshold a candidate operator is rejected when its computation
  * exceeds opr_threshold times the computation of the original operator
  * \param var_node_threshold a layout transform is rejected when the memory
  * of its source plus destination layouts exceeds var_node_threshold times
  * twice the memory of the original var
  *
  * Both filters capture `this`, so they read the thresholds and the operator
  * footprint of this very object.
  */
 ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
                            float var_node_threshold)
        : m_opr_threshold{opr_threshold},
          m_var_node_threshold{var_node_threshold},
          m_runs{runs} {
    m_opr_filter = [this](const OperatorNodeBase* orig_opr,
                          OperatorNodeBase* candidate) {
        /// \note: for the considerations of performance, we skip nchw(naive)
        /// kernels for conv bias on CUDA platform. to remove this later
        auto conv_bias = try_cast_as_op<opr::ConvBiasForward>(candidate);
        const bool is_naive_cuda_kernel =
                conv_bias &&
                conv_bias->output(0)->comp_node().device_type() ==
                        CompNode::DeviceType::CUDA &&
                conv_bias->input(0)->dtype().category() ==
                        DTypeCategory::QUANTIZED &&
                conv_bias->param().format == OprFormat::NCHW;
        if (is_naive_cuda_kernel)
            return false;

        float orig_comp = m_opr_footprint.get_computation(
                const_cast<OperatorNodeBase*>(orig_opr));
        float new_comp = m_opr_footprint.get_computation(candidate);
        // accept unless the candidate is too expensive
        return !(new_comp > m_opr_threshold * orig_comp);
    };
    m_var_node_filter = [this](const VarNode* var, TensorShape from,
                               TensorShape to, ReformatKey key) {
        /// \note: due to the alignment requirement of low-bit tensor, we skip
        /// some layout transform for low-bit tensors. The skipped layout
        /// transforms do not have corresponding dnn kernel and cannot be
        /// implemented by tensor manip operators (like reshape, dimshuffle,
        /// subtensor, etc.).
        const bool is_4bit =
                var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
                var->dtype().enumv() == DTypeEnum::Quantized4Asymm;
        if (is_4bit) {
            // the only formats a 4-bit NCHW tensor may be converted from/to
            auto convertible = [](TensorFormats fmt) {
                return fmt == TensorFormats::NHWC ||
                       fmt == TensorFormats::NCHWc64;
            };
            if (key.input_format == TensorFormats::NCHW &&
                !convertible(key.output_format)) {
                return false;
            }
            if (key.output_format == TensorFormats::NCHW &&
                !convertible(key.input_format)) {
                return false;
            }
        }
        TensorLayout orig_ly{var->shape(), var->dtype()};
        TensorLayout from_ly{from, var->dtype()};
        TensorLayout to_ly{to, var->dtype()};
        float orig_memory = orig_ly.span().dist_byte() * 2.f;
        float reformat_memory =
                from_ly.span().dist_byte() + to_ly.span().dist_byte();
        // accept unless the reformat touches too much memory
        return !(reformat_memory > orig_memory * m_var_node_threshold);
    };
 }

 ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
@@ -507,56 +499,6 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(const Problem& problem) cons
 }

 /* ================== ProfilerBase =================*/
 /*!
  * \brief construct the profiler base and install the default filters
  *
  * \param opr_threshold a candidate operator is rejected when its computation
  * exceeds opr_threshold times the computation of the original operator
  * \param var_node_threshold a layout transform is rejected when the memory
  * of its source plus destination layouts exceeds var_node_threshold times
  * twice the memory of the original var
  *
  * Both filters capture `this`, so they read the thresholds and the operator
  * footprint of this very object.
  */
 ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold)
        : m_opr_threshold{opr_threshold}, m_var_node_threshold{var_node_threshold} {
    m_opr_filter = [this](const OperatorNodeBase* orig_opr,
                          OperatorNodeBase* candidate) {
        /// \note: for the considerations of performance, we skip nchw(naive)
        /// kernels for conv bias on CUDA platform. to remove this later
        auto conv_bias = try_cast_as_op<opr::ConvBiasForward>(candidate);
        const bool is_naive_cuda_kernel =
                conv_bias &&
                conv_bias->output(0)->comp_node().device_type() ==
                        CompNode::DeviceType::CUDA &&
                conv_bias->input(0)->dtype().category() == DTypeCategory::QUANTIZED &&
                conv_bias->param().format == OprFormat::NCHW;
        if (is_naive_cuda_kernel)
            return false;

        float orig_comp = m_opr_footprint.get_computation(
                const_cast<OperatorNodeBase*>(orig_opr));
        float new_comp = m_opr_footprint.get_computation(candidate);
        // accept unless the candidate is too expensive
        return !(new_comp > m_opr_threshold * orig_comp);
    };
    m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to,
                               ReformatKey key) {
        /// \note: due to the alignment requirement of low-bit tensor, we skip
        /// some layout transform for low-bit tensors. The skipped layout
        /// transforms do not have corresponding dnn kernel and cannot be
        /// implemented by tensor manip operators (like reshape, dimshuffle,
        /// subtensor, etc.).
        const bool is_4bit = var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
                             var->dtype().enumv() == DTypeEnum::Quantized4Asymm;
        if (is_4bit) {
            // the only formats a 4-bit NCHW tensor may be converted from/to
            auto convertible = [](TensorFormats fmt) {
                return fmt == TensorFormats::NHWC || fmt == TensorFormats::NCHWc64;
            };
            if (key.input_format == TensorFormats::NCHW &&
                !convertible(key.output_format)) {
                return false;
            }
            if (key.output_format == TensorFormats::NCHW &&
                !convertible(key.input_format)) {
                return false;
            }
        }
        TensorLayout orig_ly{var->shape(), var->dtype()};
        TensorLayout from_ly{from, var->dtype()};
        TensorLayout to_ly{to, var->dtype()};
        float orig_memory = orig_ly.span().dist_byte() * 2.f;
        float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte();
        // accept unless the reformat touches too much memory
        return !(reformat_memory > orig_memory * m_var_node_threshold);
    };
 }

 std::string ProfilerBase::OperatorNodeRecord::to_string() const {
    auto str = ssprintf(
            "\nopr type: %s\nopr name: %s\ninputs:\n", opr->dyn_typeinfo()->name,
@@ -595,4 +537,68 @@ std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
    return std::make_unique<ProfilerImpl>();
 }

 /*!
  * \brief create a CachedProfiler
  * \param path forwarded unchanged to the CachedProfiler constructor
  */
 std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(
        const char* path) {
    return std::make_unique<CachedProfiler>(path);
 }

 /* ================== CachedProfiler =================*/
 /*!
  * \brief construct a profiler whose results go through ProfilerCache
  *
  * \param path cache file; when not null, an InFilePersistentCache for this
  * path is installed into the global ProfilerCache. The pointer itself is
  * stored, so the string must outlive this object.
  * \param runs, opr_threshold, var_node_threshold forwarded to ProfilerImpl
  */
 CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold,
                                float var_node_threshold)
        : ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} {
    if (m_path == nullptr)
        return;  // no file given: leave the current cache impl untouched
    // file cache
    auto&& cache = ProfilerCache::inst();
    cache.set_impl(std::make_unique<InFilePersistentCache>(m_path));
 }

 /*!
  * \brief run ProfilerImpl::profile() and, when a cache file path was given at
  * construction, dump the global ProfilerCache to that file afterwards
  */
 CachedProfiler::ProfilingResult CachedProfiler::profile(
        const Problem& problem) const {
    auto result = ProfilerImpl::profile(problem);
    const bool has_cache_file = m_path != nullptr;
    if (has_cache_file) {
        ProfilerCache::inst().dump_cache(m_path);
    }
    return result;
 }

 float CachedProfiler::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
    ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
                           extra_attribute};
    auto ret = ProfilerCache::inst().get(key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format,
                                   extra_attribute);
    ProfilerCache::inst().put(key, rst);
    return rst;
 }

 float CachedProfiler::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config,
        ReformatAttribute extra_attribute) const {
    ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
    auto ret = ProfilerCache::inst().get(key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_operator(opr, base_config, config,
                                              extra_attribute);
    ProfilerCache::inst().put(key, rst);
    return rst;
 }

 float CachedProfiler::profile_var_node(const VarNode* var,
                                       TensorFormats base_format,
                                       const ReformatKey& key) const {
    ProfilerCache::Key pf_key{var, key};
    auto ret = ProfilerCache::inst().get(pf_key);
    if (ret.valid())
        return ret.val();
    auto rst = ProfilerImpl::profile_var_node(var, base_format, key);
    ProfilerCache::inst().put(pf_key, rst);
    return rst;
 }

 // vim: syntax=cpp.doxygen
--- a/src/gopt/include/megbrain/gopt/profiler.h
+++ b/src/gopt/include/megbrain/gopt/profiler.h
@@ -18,11 +18,13 @@
 #include "megbrain/gopt/subgraph_extractor.h"
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/plugin/opr_footprint.h"
 #include "megbrain/utils/infile_persistent_cache.h"

 namespace mgb {
 namespace gopt {

 class Problem;
 class CachedProfiler;

 /*!
 * \brief A profiler that collects all the performance data to describe the
@@ -75,22 +77,245 @@ public:
    using VarNodeFilter = thin_function<bool(
            const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>;

    ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f);
    /*!
     * \brief construct a profiler from user supplied filters
     *
     * \param opr_filter filter applied to operators; moved into m_opr_filter
     * \param var_node_filter filter applied to var nodes (empty by default);
     * moved into m_var_node_filter
     */
    ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {})
            : m_opr_filter{std::move(opr_filter)},
              m_var_node_filter{std::move(var_node_filter)} {}
    ProfilerBase() = default;
    
    virtual ~ProfilerBase() = default;

    virtual ProfilingResult profile(const Problem& problem) const = 0;

    /*!
     * \brief replace the operator filter of this profiler
     *
     * \param opr_filter the new filter, copied into the profiler
     * \return reference to this profiler, so that setters can be chained
     */
    ProfilerBase& set_opr_filter(const OprFilter& opr_filter) {
        this->m_opr_filter = opr_filter;
        return *this;
    }

    /*!
     * \brief replace the var node filter of this profiler
     *
     * \param var_node_filter the new filter, copied into the profiler
     * \return reference to this profiler, so that setters can be chained
     */
    ProfilerBase& set_var_node_filter(const VarNodeFilter& var_node_filter) {
        this->m_var_node_filter = var_node_filter;
        return *this;
    }

    static std::unique_ptr<ProfilerBase> make_profiler();
    static std::unique_ptr<ProfilerBase> make_cached_profiler(
            const char* path = nullptr);

 protected:
    OprFilter m_opr_filter;
    VarNodeFilter m_var_node_filter;
    float m_opr_threshold;
    float m_var_node_threshold;
 };

 private:

 /*! \brief A default profiler impl
 */
 class ProfilerImpl : public ProfilerBase {
 public:
    /*!
     * \param runs sample times of the profiler (presumably stored in m_runs;
     * the constructor is defined out of line)
     * \param opr_threshold see m_opr_threshold
     * \param var_node_threshold see m_var_node_threshold
     */
    ProfilerImpl(int runs = 10, float opr_threshold = 2.f,
                 float var_node_threshold = 2.f);
    ~ProfilerImpl() = default;
    ProfilingResult profile(const Problem& problem) const override;

 protected:
    // NOTE(review): presumably the cost reported for configurations that are
    // skipped or cannot be profiled -- confirm against the implementation
    static constexpr float PROFILE_TIME_OUT = 1e7;
    using ReformatKey = ReformatManager::ReformatKey;
    using ReformatAttribute = ReformatKey::Attribute;
    /*!
     * \brief profile opr format agnostic operators (like elemwise, elemwise
     * multi type, typecvt etc.)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_format the original tensor format of the operator node.
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute identify whether to use image object for OpenCL
     * or automatically padding nhwc layout
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatAttribute extra_attribute =
                    ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile opr format agnostic operators (like elemwise, elemwise
     * multi type, typecvt etc.) in a single tensor format
     *
     * \param opr pointer to the operator to be profiled
     * \param base_format the original tensor format of the operator node.
     * \param tensor_format the tensor format to be profiled
     * \param extra_attribute identify whether to use image object for OpenCL
     * or automatically padding nhwc layout
     * \return elapsed time of operator in the given tensor format
     * configuration
     */
    virtual float profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            TensorFormats tensor_format,
            ReformatAttribute extra_attribute =
                    ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile opr format aware operators (like conv, deconv, conv_bias,
     * etc.)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_config the tensor formats configuration of base opr format
     * \param available_configs all the available configurations
     * \param extra_attribute identify whether to use image object for OpenCL
     * or automatically padding nhwc layout
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr,
            const OprTensorFormatsConfiguration& base_config,
            const SmallVector<OprTensorFormatsConfiguration>& available_configs,
            ReformatAttribute extra_attribute =
                    ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile opr format aware operators (like conv, deconv, conv_bias,
     * resize, warp etc.) in a single opr format configuration
     *
     * \param opr pointer to the operator to be profiled
     * \param base_config the original opr format configuration of the
     * operator node.
     * \param config the opr format configuration to be profiled
     * \param extra_attribute identify whether to use image object for OpenCL
     * or automatically padding nhwc layout
     * \return elapsed time of operator in the given opr format configuration
     */
    virtual float profile_operator(const OperatorNodeBase* opr,
                           const OprTensorFormatsConfiguration& base_config,
                           const OprTensorFormatsConfiguration& config,
                           ReformatAttribute extra_attribute =
                                   ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile layout transform of the var node
     *
     * \param var pointer to the var node to be profiled
     * \param base_format the original tensor formats in which the var node is
     * stored
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the var node record
     */
    VarNodeRecord profile_var_node(
            const VarNode* var, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatAttribute extra_attribute =
                    ReformatAttribute::DEFAULT) const;
    /*!
     * \brief profile a single layout transform of the var node
     *
     * \param var pointer to the var node to be profiled
     * \param base_format the original tensor formats in which the var node is
     * stored
     * \param key type of ReformatKey, identify the information/attributes of
     * the layout transform
     * \return elapsed time of the layout transform
     */
    virtual float profile_var_node(const VarNode* var,
                                   TensorFormats base_format,
                                   const ReformatKey& key) const;
    OprFootprint m_opr_footprint;
    float m_opr_threshold;  /// a threshold: when the computation of the newly
                            /// created operator that is built in some opr
                            /// format configuration exceeds
                            /// m_opr_threshold times that of the original
                            /// operator, the opr format configuration will be
                            /// skipped (i.e. the cost is infinite)
    float m_var_node_threshold;  /// a threshold: when the memory footprint of
                                 /// the layout transform of the var node
                                 /// exceeds m_var_node_threshold times that of
                                 /// the var node itself, the layout transform
                                 /// will be skipped (i.e. the cost is infinite)
    int m_runs;                  /// sample times of the profiler
 };

 /*!
 * \brief a ProfilerCache that manages the profiling results of operator in
 * different layouts and of layout transform of var nodes.
 */
 class ProfilerCache : public NonCopyableObj {
    // private: instances are handed out by inst(); a freshly constructed
    // cache is backed by an InMemoryPersistentCache
    ProfilerCache() : m_impl{std::make_unique<InMemoryPersistentCache>()} {};

 public:
    using ReformatKey = ReformatManager::ReformatKey;
    using ReformatAttribute = ReformatKey::Attribute;
    using OprFormat = ProfilerBase::OprFormat;
    /*!
     * \brief key of one cached profiling result
     *
     * A key is built either from an operator (together with the opr format
     * and the extra reformat attribute) or from a var node (together with the
     * reformat key of the layout transform).
     */
    class Key final : public NonCopyableObj {
        // NOTE(review): presumably filled by build_blob_from_*() and
        // build_category() and exposed through blob() / category(); their
        // definitions are not visible in this header
        std::string m_blob_storage;
        std::string m_category;

        //! what identifies a profiled operator
        struct OprKey {
            const OperatorNodeBase* opr;
            OprFormat opr_format;
            ReformatAttribute extra_attribute;
        };

        //! what identifies a profiled layout transform of a var node
        struct VarKey {
            const VarNode* var;
            ReformatKey key;
        };

        //! storage shared by both kinds of key; zero filled on construction,
        //! exactly one alternative is assigned by the Key constructors
        union KeyImpl {
            OprKey opr_key;
            VarKey var_key;

            KeyImpl() { std::memset(this, 0, sizeof(KeyImpl)); }
        };

        KeyImpl m_key_impl;

        void build_blob_from_opr();
        void build_blob_from_var();
        void build_category(CompNode cn);

    public:
        /*!
         * \brief build the key of an operator profiling result
         *
         * \param opr the operator to be profiled; it must carry the
         * SINGLE_COMP_NODE flag, otherwise an assertion fails
         * \param opr_format the opr format the operator is profiled in
         * \param extra_attribute the extra reformat attribute
         */
        Key(const OperatorNodeBase* opr, OprFormat opr_format,
            ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) {
            m_key_impl.opr_key = {opr, opr_format, extra_attribute};
            build_blob_from_opr();
            mgb_assert(
                    opr->node_prop().contain(
                            cg::OperatorNodeProp::Flag::SINGLE_COMP_NODE),
                    "operator with multiple comp node is not supported(opr:%s)",
                    opr->cname());
            // here, we assume that the operator to be profiled has only one
            // comp node, so the comp node of the first output is used to
            // build the category
            build_category(opr->output(0)->comp_node());
        }

        /*!
         * \brief build the key of a var node layout transform profiling
         * result
         *
         * \param var the var node to be profiled; its comp node is used to
         * build the category
         * \param key the reformat key describing the layout transform
         */
        Key(const VarNode* var, ReformatKey key) {
            m_key_impl.var_key = {var, key};
            build_blob_from_var();
            build_category(var->comp_node());
        }

        const std::string& category() const;
        PersistentCache::Blob blob() const;
    };

    //! a cached profiling result (the profilers return elapsed time as float)
    using Result = float;

 public:
    //! access the cache instance (the constructor is private)
    static ProfilerCache& inst();

    //! replace the PersistentCache backend that stores the results
    ProfilerCache& set_impl(std::unique_ptr<PersistentCache> impl);

    // NOTE(review): presumably writes the cached results to the file at
    // \p path -- confirm against the implementation
    void dump_cache(const char* path);

    //! look up \p key; callers check valid() on the returned Maybe before
    //! reading the value
    Maybe<Result> get(const Key& key);

    //! store \p result under \p key
    void put(const Key& key, Result& result);

 private:
    std::unique_ptr<PersistentCache> m_impl;
 };

 /*!
  * \brief a profiler whose single-configuration profiling hooks go through
  * ProfilerCache: a cached result is returned when available, otherwise the
  * ProfilerImpl measurement is taken and stored in the cache.
  */
 class CachedProfiler final : public ProfilerImpl {
 public:
    /*!
     * \param path NOTE(review): presumably the file the cache is loaded from
     * and/or dumped to; may be nullptr (the default) -- confirm against the
     * implementation
     * \param runs, opr_threshold, var_node_threshold same defaults as the
     * ProfilerImpl constructor
     */
    CachedProfiler(const char* path = nullptr, int runs = 10,
                   float opr_threshold = 2.f, float var_node_threshold = 2.f);
    ProfilingResult profile(const Problem& problem) const override;

 private:
    //! cached profiling of an operator in a single tensor format
    float profile_operator(const OperatorNodeBase* opr,
                           TensorFormats base_format,
                           TensorFormats tensor_format,
                           ReformatAttribute extra_attribute =
                                   ReformatAttribute::DEFAULT) const override;
    //! cached profiling of an operator in a single opr format configuration
    float profile_operator(const OperatorNodeBase* opr,
                           const OprTensorFormatsConfiguration& base_config,
                           const OprTensorFormatsConfiguration& config,
                           ReformatAttribute extra_attribute =
                                   ReformatAttribute::DEFAULT) const override;
    //! cached profiling of a single layout transform of a var node
    float profile_var_node(const VarNode* var, TensorFormats base_format,
                           const ReformatKey& key) const override;
    // non-owning pointer: the string passed to the constructor is not copied
    // by this declaration -- NOTE(review): verify it outlives the profiler
    const char* m_path;
 };

 }  // namespace gopt
--- a/src/gopt/test/cache_data.h
+++ b/src/gopt/test/cache_data.h
--- a/src/gopt/test/embed_cache.py
+++ b/src/gopt/test/embed_cache.py
@@ -0,0 +1,93 @@
 #!/usr/bin/env python3
 # MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 #
 # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

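 # To keep the profiling results used by the global graph optimization independent of the CI environment,
 # fixed profiling results are stored in a cache. Every test run reads the cached profiling results from
 # memory and performs the global graph optimization according to them.
 # This script converts the dumped cache files into a cache header file, which the tests read their data from.
 # If you add tests related to the global graph optimization to src/gopt/test/layout_transform_pass.cpp,
 # consider using this script to process the profiling data:
 # 1. First change `#define MGB_WITH_CACHED_TEST 1` in src/gopt/test/layout_transform_pass.cpp to
 # `#define MGB_WITH_CACHED_TEST 0`
 # 2. Build megbrain_test and run all the tests related to the global graph optimization:
 #    ./megbrain_test --gtest_filter="*LayoutTransform*"
 # 3. Use this script to pack all the cache files together:
 #    python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache)
 # 4. Revert the define changed in step 1, so that the profiling procedure uses the cached data. Then
 #    rebuild megbrain_test and verify that the tests pass.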
 import os.path
 import logging
 import hashlib
 import argparse
 import struct
 import itertools
 import sys
 import subprocess

 # Configure the root logger first, then fetch the logger of this module.
 logging.basicConfig(format='%(asctime)-15s %(message)s', level=logging.WARNING)
 logger = logging.getLogger(__name__)

 # Decimal text of every possible byte value, e.g. 65 -> '65'.
 CHAR_MAP = dict((byte, str(byte)) for byte in range(256))

 def _u32(data):
    return struct.unpack('<I', data)[0]

 class CacheDataGenerator:
    """Embed dumped ``*.cache`` files into one C++ header.

    Every cache file becomes a ``static const std::vector<uint8_t>`` whose
    name is the file name without extension, with ``.`` replaced by ``_``.
    """
    _cache_files = None

    def __init__(self, cache_files):
        self._cache_files = cache_files

    def _get_hash(self):
        # NOTE(review): ``self._hash`` is never assigned inside this class.
        digest = self._hash.digest()
        return _u32(digest[:4])

    def gen_cache_data(self, fpath):
        """Return the bytes of ``fpath`` as comma separated decimal numbers,
        with a line break inserted before every 50th value."""
        with open(fpath, 'rb') as stream:
            raw = stream.read()
        tokens = [CHAR_MAP[byte] for byte in raw]
        for pos in range(50, len(tokens), 50):
            tokens[pos] = '\n' + tokens[pos]
        return ','.join(tokens)

    def gen_cache_data_header(self, fout, src_map):
        """Write the header to ``fout``; ``src_map`` maps a cache name to the
        text produced by :meth:`gen_cache_data`. Entries are emitted in
        sorted order."""
        for prologue in ('// generated embed_cache.py\n',
                         '#include <vector>\n',
                         '#include <stdint.h>\n'):
            fout.write(prologue)
        for name, payload in sorted(src_map.items()):
            symbol = name.replace('.', '_')
            fout.write("""
 static const std::vector<uint8_t> {} = {{
 """.format(symbol))
            fout.write('{}'.format(payload))
            fout.write('};\n')

    def invoke(self, output):
        """Convert all the cache files and write the header to ``output``."""
        logger.info('generate cache_data.h ...')
        cache_by_name = {}
        for cache_path in self._cache_files:
            base, ext = os.path.splitext(os.path.basename(cache_path))
            assert ext == ".cache", "ext: {}, fname {}".format(ext, cache_path)
            assert base not in cache_by_name, "duplicated kernel: " + base
            cache_by_name[base] = self.gen_cache_data(cache_path)
        with open(output, 'w') as fout:
            self.gen_cache_data_header(fout, cache_by_name)
        logger.info('done')


 if __name__ == '__main__':
    # Command line: embed_cache.py -o <header> <cache file> [<cache file> ...]
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='embed cache into cache header file')
    arg_parser.add_argument('-o', '--output', required=True,
                            help='output source file')
    arg_parser.add_argument('cache', nargs='+',
                            help='cache files to be embedded')
    cli_args = arg_parser.parse_args()
    CacheDataGenerator(cli_args.cache).invoke(cli_args.output)
--- a/src/gopt/test/layout_transform_pass.cpp
+++ b/src/gopt/test/layout_transform_pass.cpp
@@ -23,6 +23,12 @@
 #include "megbrain/plugin/profiler.h"
 #include "megbrain/serialization/serializer.h"

 #define MGB_WITH_CACHED_TEST 1

 #if MGB_WITH_CACHED_TEST
 #include "./cache_data.h"
 #endif

 using namespace mgb;
 using namespace gopt;
 using namespace serialization;
@@ -53,6 +59,78 @@ size_t find_opr_num(SymbolVar endpoint) {
    cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    return opr_num;
 }

 using OprFormat = Problem::OprFormat;
 /*!
  * \brief translate a tensor format into the corresponding opr format
  *
  * \param tensor_format the tensor format to be translated
  * \return the matching opr format
  * \throw MegBrainError if the tensor format has no corresponding opr format
  */
 OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
    struct Mapping {
        TensorFormats tensor;
        OprFormat opr;
    };
    static const Mapping table[] = {
            {TensorFormats::NCHW, OprFormat::NCHW},
            {TensorFormats::NCHWc4, OprFormat::NCHW4},
            {TensorFormats::NCHWc8, OprFormat::NCHW8},
            {TensorFormats::NCHWc32, OprFormat::NCHW32},
            {TensorFormats::NCHWc64, OprFormat::NCHW64},
            {TensorFormats::NHWC, OprFormat::NHWC},
            {TensorFormats::CHWNc4, OprFormat::CHWN4},
    };
    for (const auto& entry : table) {
        if (entry.tensor == tensor_format)
            return entry.opr;
    }
    mgb_throw(MegBrainError, "tensor format(%u) is not supported",
              static_cast<uint32_t>(tensor_format));
 }

 class ProfilerMock : public ProfilerImpl {
 public:
    ProfilerMock(const uint8_t* bin, size_t size) {
        mgb_assert(bin != nullptr);
        ProfilerCache::inst().set_impl(
                std::make_unique<InFilePersistentCache>(bin, size));
    }
    ~ProfilerMock() {
        // reset in memory cache
        ProfilerCache::inst().set_impl(
                std::make_unique<InMemoryPersistentCache>());
    }

 private:
    float profile_operator(const OperatorNodeBase* opr,
                           TensorFormats base_format,
                           TensorFormats tensor_format,
                           ReformatAttribute extra_attribute =
                                   ReformatAttribute::DEFAULT) const override {
        ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
                               extra_attribute};
        auto ret = ProfilerCache::inst().get(key);
        if (ret.valid())
            return ret.val();
        mgb_assert(false);
    }
    float profile_operator(const OperatorNodeBase* opr,
                           const OprTensorFormatsConfiguration& base_config,
                           const OprTensorFormatsConfiguration& config,
                           ReformatAttribute extra_attribute =
                                   ReformatAttribute::DEFAULT) const override {
        ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
        std::string tmp;
        tmp.reserve(key.blob().size);
        auto ret = ProfilerCache::inst().get(key);
        if (ret.valid())
            return ret.val();
        mgb_assert(false);
    }
    float profile_var_node(const VarNode* var, TensorFormats base_format,
                           const ReformatKey& key) const override {
        ProfilerCache::Key pf_key{var, key};
        auto ret = ProfilerCache::inst().get(pf_key);
        if (ret.valid())
            return ret.val();
        mgb_assert(false);
    }
 };
 }  // namespace

 #if MGB_CUDA
@@ -96,15 +174,23 @@ TEST(TestLayoutTransform, Resnet18_QS8) {
            OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
            ReformatAttribute::AUTO_PADDING_NHWC};
    auto ctx = std::make_unique<LayoutTransformContext>(
            std::move(opr_list), std::move(available_tensor_formats), attribute);
    ctx->add_opr_config(
               opr::ConvBiasForward::typeinfo(),
               {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC})
            .add_opr_config(
                    opr::PoolingForward::typeinfo(),
                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
                     OprFormat::CHWN4});
    auto profiler = ProfilerBase::make_profiler();
            std::move(opr_list), std::move(available_tensor_formats),
            attribute);
    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
                        {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
                         OprFormat::NHWC})
            .add_opr_config(opr::PoolingForward::typeinfo(),
                            {OprFormat::NCHW4, OprFormat::NCHW32,
                             OprFormat::NHWC, OprFormat::CHWN4});
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(
                    TestLayoutTransform_Resnet18_QS8.data()),
            TestLayoutTransform_Resnet18_QS8.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.Resnet18_QS8.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto new_output =
@@ -190,7 +276,15 @@ TEST(TestLayoutTransform, Resnet18_QS4) {
                    opr::PoolingForward::typeinfo(),
                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
                     OprFormat::NHWC, OprFormat::CHWN4});
    auto profiler = ProfilerBase::make_profiler();
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(
                    TestLayoutTransform_Resnet18_QS4.data()),
            TestLayoutTransform_Resnet18_QS4.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.Resnet18_QS4.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto new_output =
@@ -305,7 +399,15 @@ TEST(TestLayoutTransform, Detection_QS8) {
                    opr::PoolingForward::typeinfo(),
                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
                     OprFormat::NHWC, OprFormat::CHWN4});
    auto profiler = ProfilerBase::make_profiler();
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(
                    TestLayoutTransform_Detection_QS8.data()),
            TestLayoutTransform_Detection_QS8.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.Detection_QS8.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto new_outputs =
@@ -375,7 +477,15 @@ TEST(TestLayoutTransform, Detection_QS4) {
                    opr::PoolingForward::typeinfo(),
                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
                     OprFormat::NHWC, OprFormat::CHWN4});
    auto profiler = ProfilerBase::make_profiler();
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(
                    TestLayoutTransform_Detection_QS4.data()),
            TestLayoutTransform_Detection_QS4.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.Detection_QS4.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto new_outputs =
@@ -443,10 +553,18 @@ TEST(TestLayoutTransform, Wide) {
            OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
            ReformatAttribute::DEFAULT};
    auto ctx = std::make_unique<LayoutTransformContext>(
            std::move(opr_list), std::move(available_tensor_formats), attribute);
    ctx->add_opr_config(
            opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC});
    auto profiler = ProfilerBase::make_profiler();
            std::move(opr_list), std::move(available_tensor_formats),
            attribute);
    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
                        {OprFormat::NCHW, OprFormat::NHWC});
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(TestLayoutTransform_Wide.data()),
            TestLayoutTransform_Wide.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.Wide.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto v = gopt::GraphOptimizer{}
@@ -463,12 +581,8 @@ TEST(TestLayoutTransform, Wide) {
    auto func = network.graph->compile({{sym_o, {}}});
    func->execute();
    gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json"));
    /// check global layout transform pass, no dimshuffle
    /// disable the following check, to make ci stable.
 #if 0
    auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o);
    ASSERT_EQ(nr_dimshuffle, 0u);
 #endif
    auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o);
    ASSERT_EQ(nr_param_merge, 1u);
    /// check first conv format
@@ -477,48 +591,6 @@ TEST(TestLayoutTransform, Wide) {
    ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW);
 }

 // Checks that a quantized (QuantizedS4) FUSE_ADD_RELU elemwise gives the same
 // result when it is computed directly on the original layout and when its
 // inputs are relayouted with NCHW_NCHW64 and the output is relayouted back
 // with NCHW64_NCHW. Requires one GPU.
 TEST(TestLayoutTransform, ElemwiseMultiType) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    Network network(cn);
    // two inputs of shape {64, 64, 1, 2}, converted to QuantizedS4
    auto x = network.add_var("x", {64, 64, 1, 2});
    auto y = network.add_var("y", {64, 64, 1, 2});
    x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});
    y = network.add_type_cvt(y, dtype::QuantizedS4{1.f});
    // path 1 (z): convert to float, compute in float, then round trip through
    // QuantizedS4
    auto x_ = network.add_type_cvt(x, dtype::Float32());
    auto y_ = network.add_type_cvt(y, dtype::Float32());
    auto z = network.add_elemwise(
            {x_, y_}, dtype::Float32(), opr::Elemwise::Mode::FUSE_ADD_RELU);
    z = network.add_type_cvt(z, dtype::QuantizedS4{1.f});
    z = network.add_type_cvt(z, dtype::Float32());
    // path 3 (z2): compute directly on the quantized inputs
    auto z2 = network.add_elemwise(
            {x, y}, dtype::QuantizedS4{1.f}, opr::Elemwise::Mode::FUSE_ADD_RELU);
    z2 = network.add_type_cvt(z2, dtype::Float32());
    HostTensorND t1;
    auto func1 = network.graph->compile({make_callback_copy(z, t1)});
    func1->execute();

    HostTensorND t3;
    auto func3 = network.graph->compile({make_callback_copy(z2, t3)});
    func3->execute();

    // path 2 (alter_z): same quantized computation, but with the inputs
    // relayouted by NCHW_NCHW64 and the result relayouted back by NCHW64_NCHW
    auto alter_x = opr::RelayoutFormat::make(
            x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
    auto alter_y = opr::RelayoutFormat::make(
            y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
    auto alter_z = network.add_elemwise(
            {alter_x, alter_y}, dtype::QuantizedS4{1.f},
            opr::Elemwise::Mode::FUSE_ADD_RELU);
    alter_z = opr::RelayoutFormat::make(
            alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW);
    alter_z = network.add_type_cvt(alter_z, dtype::Float32());
    HostTensorND t2;
    auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)});
    func2->execute();
    // only the two quantized paths are compared; the comparison against the
    // float path (t1) is disabled
    // MGB_ASSERT_TENSOR_EQ(t1, t3);
    MGB_ASSERT_TENSOR_EQ(t2, t3);
 }

 #if CUDA_VERSION >= 10020
 TEST(TestLayoutTransform, DetectionHead) {
    REQUIRE_GPU(1);
@@ -600,8 +672,15 @@ TEST(TestLayoutTransform, DetectionHead) {
            .add_opr_config(
                    opr::WarpPerspectiveForward::typeinfo(),
                    {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});

    auto profiler = ProfilerBase::make_profiler();
 #if MGB_WITH_CACHED_TEST
    auto profiler = std::make_unique<ProfilerMock>(
            static_cast<const uint8_t*>(
                    TestLayoutTransform_DetectionHead.data()),
            TestLayoutTransform_DetectionHead.size());
 #else
    auto profiler = ProfilerBase::make_cached_profiler(
            "TestLayoutTransform.DetectionHead.cache");
 #endif
    std::unique_ptr<SolverBase> solver{
            new DynamicProgrammingSolver(std::move(profiler))};
    auto new_out_vars =