GitOrigin-RevId: 94e333ec80
tags/v1.5.0
@@ -91,6 +91,11 @@ class MaxTensorDiff : public OperatorBase {
void check_exec(const TensorLayout& layout1,
const TensorLayout& layout2, size_t workspace_in_bytes);
};
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format);
} // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h"
@@ -318,36 +318,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args,
megdnn_assert(false);
}
}
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format) {
bool share_in_channel = false;
if (format == param::ConvBias::Format::NCHW ||
format == param::ConvBias::Format::NCHW4_NCHW) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWC) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 &&
bias[2] == 1);
} else if (format == param::ConvBias::Format::NCHW4 ||
format == param::ConvBias::Format::NCHW8 ||
format == param::ConvBias::Format::NCHW32 ||
format == param::ConvBias::Format::NCHW64 ||
format == param::ConvBias::Format::NCHW4_NCHW32 ||
format == param::ConvBias::Format::NCHW32_NCHW4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWCD4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 &&
bias[3] == 1);
} else {
megdnn_assert(format == param::ConvBias::Format::CHWN4);
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 &&
bias[3] == 1);
}
return share_in_channel;
}
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -22,8 +22,6 @@ void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args,
const TensorND* dst_tensor,
const TensorND* bias_tensor);
bool check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format);
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -10,6 +10,7 @@
*/
#include "src/common/utils.h"
#include "megdnn/oprs/utils.h"
#include "megdnn/handle.h"
#include <cstdarg>
@@ -344,4 +345,33 @@ size_t& CpuNDRange::operator[](size_t idx) {
return m_dim[idx];
}
bool megdnn::check_bias_share_in_channel(const TensorLayout& bias,
const param::ConvBias::Format format) {
bool share_in_channel = false;
if (format == param::ConvBias::Format::NCHW ||
format == param::ConvBias::Format::NCHW4_NCHW) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWC) {
share_in_channel = (bias.ndim == 4 && bias[0] == 1 && bias[1] == 1 &&
bias[2] == 1);
} else if (format == param::ConvBias::Format::NCHW4 ||
format == param::ConvBias::Format::NCHW8 ||
format == param::ConvBias::Format::NCHW32 ||
format == param::ConvBias::Format::NCHW64 ||
format == param::ConvBias::Format::NCHW4_NCHW32 ||
format == param::ConvBias::Format::NCHW32_NCHW4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[2] == 1 &&
bias[3] == 1);
} else if (format == param::ConvBias::Format::NHWCD4) {
share_in_channel = (bias.ndim == 5 && bias[0] == 1 && bias[1] == 1 &&
bias[3] == 1);
} else {
megdnn_assert(format == param::ConvBias::Format::CHWN4);
share_in_channel = (bias.ndim == 5 && bias[1] == 1 && bias[2] == 1 &&
bias[3] == 1);
}
return share_in_channel;
}
// vim: syntax=cpp.doxygen
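As a quick illustration of the helper above — a hedged sketch with hypothetical layouts (assuming megdnn's TensorLayout and DType headers): a {1, C, 1, 1} bias in NCHW is shared within each channel, while a bias carrying a real batch dim is not.

    // hypothetical per-channel bias for NCHW: N == H == W == 1
    megdnn::TensorLayout per_channel({1, 24, 1, 1}, megdnn::dtype::Float32());
    bool shared = megdnn::check_bias_share_in_channel(
            per_channel, megdnn::param::ConvBias::Format::NCHW);  // true
    // a bias whose batch dim is not 1 is not shared in channel
    megdnn::TensorLayout batched({12, 24, 1, 1}, megdnn::dtype::Float32());
    bool batched_shared = megdnn::check_bias_share_in_channel(
            batched, megdnn::param::ConvBias::Format::NCHW);  // false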
@@ -158,6 +158,11 @@ R"__usage__(
R"__usage__(
--fast-run-algo-policy <path>
It reads the cache file before profiling, and saves new fastrun results into the cache file.
--fast-run-shared-batch-size
Set the batch size used during fastrun. Note that it may not be the same as the actual batch size at runtime.
--binary-equal-between-batch
Each batch of output is guaranteed to be binary-equal if each batch of input is binary-equal.
Note that if this option is turned on, `--reproducible` will also be turned on.
--reproducible
Enable choosing algos which are reproducible. It is mainly used for cudnn algos.
See https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#reproducibility
@@ -1356,6 +1361,20 @@ Args Args::from_argv(int argc, char **argv) {
ret.fast_run_cache_path = argv[i];
continue;
}
if (!strcmp(argv[i], "--fast-run-shared-batch-size")) {
++i;
mgb_assert(i < argc,
"value not given for --fast-run-shared-batch-size");
int32_t batch_size = std::stoi(argv[i]);
mgb_assert(batch_size >= 0);
graph_opt.fast_run_config.shared_batch_size = batch_size;
continue;
}
if (!strcmp(argv[i], "--binary-equal-between-batch")) {
graph_opt.fast_run_config.binary_equal_between_batch = true;
ret.reproducible = true;
continue;
}
if (!strcmp(argv[i], "--reproducible")) {
ret.reproducible = true;
continue;
@@ -1452,6 +1471,14 @@ Args Args::from_argv(int argc, char **argv) {
return ret;
}
#if MGB_ENABLE_FASTRUN
if (graph_opt.fast_run_config.shared_batch_size) {
mgb_assert(ret.use_fast_run || ret.use_full_run ||
!ret.fast_run_cache_path.empty(),
"--fast-run-shared-batch-size should be used with "
"--fast-run/--full-run/--fast-run-algo-policy");
}
#endif
return ret;
}
@@ -502,7 +502,28 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
//! contains any user data associated with this graph
UserDataContainer user_data;
}; // Options
//! Control parameter for fast run
struct FastRunConfig {
/*!
* the batch size used by fastrun
*
* A non-zero value means that fastrun uses this batch size
* regardless of the batch size of the model
*
* Zero means fastrun uses the batch size of the model
*/
uint32_t shared_batch_size = 0;
/*!
* \brief whether the content of each output batch is guaranteed
* to be binary-equal whenever the content of each input batch
* is binary-equal
*/
bool binary_equal_between_batch = false;
} fast_run_config;
}; // Options
Options& options() {
return m_options;
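The same configuration can also be set programmatically on a graph (this is what the new tests below do); a minimal sketch:

    auto graph = mgb::ComputingGraph::make();
    // profile with a fixed batch size of 20, whatever the model's batch is
    graph->options().fast_run_config.shared_batch_size = 20;
    // request batch-wise binary-equal outputs (also forces reproducible algos)
    graph->options().fast_run_config.binary_equal_between_batch = true;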
@@ -60,21 +60,31 @@ std::string profile_name(Opr* opr) {
template <typename Opr>
std::string format_fixlayouts(
const typename opr::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
size_t arity_in, size_t arity_out) {
size_t arity_in, size_t arity_out,
const std::string& delimiter = " -> ") {
std::string ret;
ret.append(": tensor layouts(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
if (arity_in) {
ret.append("(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i].to_string() + " ");
}
ret.append(layouts[i].to_string() + " ");
ret.append(")");
}
if (arity_in && arity_out) {
ret.append(delimiter);
}
ret.append(") -> (");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
if (arity_out) {
ret.append("(");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
ret.append(")");
}
return ret;
}
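After this refactor, a group is printed only when its arity is non-zero, and the delimiter only when both groups are present; for a hypothetical call with two inputs and one output the result looks roughly like:

    // format_fixlayouts<Opr>(layouts, 2, 1), default " -> " delimiter:
    //   (<inp0> , <inp1> ) -> (<out0> )
    // with arity_in == 0 or arity_out == 0, that group and the delimiter
    // are omitted entirely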
@@ -247,7 +257,7 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
CircularDepsChecker& checker) {
auto&& search_item = megdnn::Algorithm::SearchItem{
OprTypeFromOprTrait<Opr>::opr_type, helper.param(),
to_layout_array<Opr>(helper.layouts())};
to_layout_array<Opr>(helper.fastrun_layouts())};
checker.put(search_item);
std::vector<megdnn::Algorithm::SearchItem> ret;
for (auto algo_info : helper.get_all_candidates()) {
@@ -255,8 +265,9 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
helper.get_algorithm_from_desc(algo_info.desc);
mgb_assert(algo, "Unknown algo description");
std::vector<megdnn::Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(to_layout_array<Opr>(helper.layouts()),
helper.megdnn_opr());
algo->get_subopr_list(
to_layout_array<Opr>(helper.fastrun_layouts()),
helper.megdnn_opr());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr =
@@ -323,6 +334,166 @@ static Algorithm::Info::Desc deserialize_read_pod(const std::string& data,
namespace mgb {
namespace opr {
template <class Opr>
class LayoutsModifier {
using FixedTensorLayouts = typename AlgoChooser<Opr>::FixedTensorLayouts;
public:
static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {}
private:
//! index of the batch dim in the tensor layout, e.g. 3 for CHWN4
static size_t index_of_batch(const typename Opr::Param&) { return 0; }
//! indices of the inputs/outputs whose layouts contain the batch dim,
//! e.g. src(0) and dst(2) for conv
static std::vector<size_t> sm_indices_contain_batch;
};
template <class Opr>
std::vector<size_t> LayoutsModifier<Opr>::sm_indices_contain_batch = {};
#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = \
typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param&) { return 0; } \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DForward,
(std::initializer_list<size_t>{0, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardData,
(std::initializer_list<size_t>{1, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::Convolution3DBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchedMatrixMul,
(std::initializer_list<size_t>{0, 1, 2}))
#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST
#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = \
typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on(FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param& param) { \
if (param.format == opr::Param::Format::CHWN4) { \
return 3; \
} \
return 0; \
} \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionForward,
(std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardData,
(std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::ConvolutionBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareForward,
(std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardData,
(std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::LocalShareBackwardFilter,
(std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvForward,
(std::initializer_list<size_t>{0, 2, 3,
4}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardData,
(std::initializer_list<size_t>{0, 2, 3, 4,
5, 6, 7}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::DeformableConvBackwardFilter,
(std::initializer_list<size_t>{0, 1, 2,
3}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(megdnn::BatchConvBiasForward,
(std::initializer_list<size_t>{0, 1, 2, 3,
4}))
#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST
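A hedged sketch of what these internal modifiers do, with hypothetical NCHW conv layouts: only the tensors listed in sm_indices_contain_batch get their batch dim rewritten.

    using Opr = megdnn::ConvolutionForward;
    AlgoChooser<Opr>::FixedTensorLayouts layouts{
            TensorLayout({12, 3, 36, 36}, dtype::Float32()),   // src (idx 0)
            TensorLayout({4, 3, 3, 3}, dtype::Float32()),      // filter (idx 1)
            TensorLayout({12, 4, 34, 34}, dtype::Float32())};  // dst (idx 2)
    Opr::Param param;  // NCHW by default, so the batch dim index is 0
    LayoutsModifier<Opr>::on(layouts, param, 20);
    // layouts[0] == {20,3,36,36}, layouts[2] == {20,4,34,34}; filter untouched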
template <>
class LayoutsModifier<megdnn::ConvBiasForward> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::ConvBiasForward>::FixedTensorLayouts;
static void on(FixedTensorLayouts& layouts,
const megdnn::ConvBiasForward::Param& param,
size_t new_batch_size) {
size_t batch_index = index_of_batch(param);
for (size_t index : sm_indices_contain_batch) {
layouts.at(index)[batch_index] = new_batch_size;
}
for (size_t index : sm_indices_contain_batch_broadcast) {
if (!check_bias_share_in_channel(layouts.at(index), param.format)) {
layouts.at(index)[batch_index] = new_batch_size;
}
}
}
private:
static std::vector<size_t> sm_indices_contain_batch;
static std::vector<size_t> sm_indices_contain_batch_broadcast;
static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) {
if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) {
return 3;
}
return 0;
}
};
std::vector<size_t>
LayoutsModifier<megdnn::ConvBiasForward>::sm_indices_contain_batch = {
0, 3, 4};
std::vector<size_t> LayoutsModifier<
megdnn::ConvBiasForward>::sm_indices_contain_batch_broadcast = {2};
template <>
class LayoutsModifier<megdnn::MatrixMul> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::MatrixMul>::FixedTensorLayouts;
static void on(FixedTensorLayouts& layouts,
const megdnn::MatrixMul::Param& param,
size_t new_batch_size) {
//! Because we do not know whether the batch size is in dimension m
//! or dimension n, we simply treat both m and n as the batch here.
// FIXME Find a way to make mgb obtain the batch size information
// automatically
layouts.at(2)[0] = new_batch_size;
layouts.at(2)[1] = new_batch_size;
if (param.transposeA) {
layouts.at(0)[1] = new_batch_size;
} else {
layouts.at(0)[0] = new_batch_size;
}
if (param.transposeB) {
layouts.at(1)[0] = new_batch_size;
} else {
layouts.at(1)[1] = new_batch_size;
}
}
};
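Because the modifier cannot tell whether the batch lives in m or n, it rewrites both dims of the output (and the matching dim of each input); a hedged sketch with the hypothetical shapes used by the MatrixMul test below:

    megdnn::MatrixMul::Param param;  // transposeA == transposeB == false
    AlgoChooser<megdnn::MatrixMul>::FixedTensorLayouts layouts{
            TensorLayout({10, 12}, dtype::Float32()),   // A: m x k
            TensorLayout({12, 12}, dtype::Float32()),   // B: k x n
            TensorLayout({10, 12}, dtype::Float32())};  // C: m x n
    LayoutsModifier<megdnn::MatrixMul>::on(layouts, param, 20);
    // C -> {20,20}; A -> {20,12} (m rewritten); B -> {12,20} (n rewritten)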
///////////////////////////// AlgoChooserHelper //////////////////////////
template <typename Opr>
AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
@@ -331,14 +502,25 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess)
: m_layouts{layouts},
: m_fastrun_layouts{layouts},
m_incache_layouts{layouts},
m_dnn_opr{megdnn_opr},
m_param{param_str},
m_base_mgb_opr{mgb_opr},
m_cn{cn},
m_execution_policy{execution_policy},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
auto fastrun_batch_size =
owner_graph()->options().fast_run_config.shared_batch_size;
if (fastrun_batch_size) {
LayoutsModifier<Opr>::on(m_incache_layouts, m_dnn_opr->param(), 0);
LayoutsModifier<Opr>::on(m_fastrun_layouts, m_dnn_opr->param(),
fastrun_batch_size);
}
mgb_assert(m_fastrun_layouts.size() == layouts.size());
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
@@ -358,13 +540,13 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_heuristic(
policy.algo =
APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first, attr.second),
m_layouts)
m_fastrun_layouts)
.desc;
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_dnn_opr);
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
@@ -393,7 +575,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::choose_by_profile(
if (policy.algo.valid()) {
return policy;
}
if (!algo_usable_on_shape_change<Opr>()) {
if (is_matmul<Opr>()) {
mgb_log_warn(
"choose algo by heuristic, which may cause performance "
"regression.");
@@ -442,7 +624,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache(
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(),
AlgoChooserProfileCache::Key cache_key{m_incache_layouts.data(),
m_incache_layouts.size(),
&origin_param, sizeof(origin_param)};
auto&& rst = cache.get(cache_key);
if (!rst.valid())
@@ -472,21 +655,21 @@ AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache(
}
std::string layouts_str =
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out);
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out);
if (skip_by_negative) {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. There are available algos match "
"opr: %s, layouts: %s, No usable algo. There are available "
"algos match "
| "positive strategy(%s), but filtered by negative stategy(%s).", | |||
m_base_mgb_opr->dyn_typeinfo()->name,
layouts_str.c_str(),
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
} else {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. algos read from cache could not "
"opr: %s, layouts: %s, No usable algo. algos read from cache "
"could not "
"satisfy positive strategy(%s)",
m_base_mgb_opr->dyn_typeinfo()->name,
layouts_str.c_str(),
m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str());
}
@@ -508,7 +691,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
auto target_attr =
extract_algo_attribute(selected_strategy);
std::string layouts_str = format_fixlayouts<Opr>(
m_layouts, arity_in, arity_out);
m_fastrun_layouts, arity_in, arity_out);
std::string msg = ssprintf(
"(opr : %s, layouts %s, with attribute(%s) and "
"without attribute(%s)",
@@ -535,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first,
attr.second),
m_layouts)
m_fastrun_layouts)
.desc;
mgb_assert(policy.algo.valid(),
"No algo found from heuristic with strategy %u and "
@@ -548,7 +731,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(m_layouts), m_dnn_opr);
to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn);
@@ -575,26 +758,32 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
template <typename Opr>
size_t AlgoChooser<Opr>::AlgoChooserHelper::get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const {
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes")))
m_dnn_opr->execution_policy() = policy;
size_t result;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) {
auto&& opr = _(m_dnn_opr);
auto prep = this->construct_fake_preprocess_filter();
auto prep =
this->construct_fake_preprocess_filter(*layouts_ptr);
PreprocessFilter<Opr>* prep_ptr =
prep.valid() ? &prep.val() : nullptr;
result = std::max(
APPLY(opr->get_preprocess_workspace_in_bytes(args...),
m_layouts),
*layouts_ptr),
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
m_layouts));
*layouts_ptr));
},
/* else */
[&](auto _) {
result = APPLY(_(m_dnn_opr)->get_workspace_in_bytes(args...),
m_layouts);
*layouts_ptr);
});
return result;
MIDOUT_E
@@ -605,8 +794,8 @@ std::vector<typename AlgoChooser<Opr>::ImplAlgo>
AlgoChooser<Opr>::AlgoChooserHelper::get_all_candidates() const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates")))
auto heu = choose_by_heuristic(m_execution_policy.strategy);
auto&& ret =
APPLY(m_dnn_opr->get_all_algorithms_info(args...), m_layouts);
auto&& ret = APPLY(m_dnn_opr->get_all_algorithms_info(args...),
m_fastrun_layouts);
bool found = false;
for (size_t i = 0; i < ret.size(); ++i) {
if (ret[i].desc == heu.algo) {
@@ -637,7 +826,7 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo(
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy);
param.workspace = get_workspace_size_bytes(policy);
for (int i = 0; i < arity; ++i) {
auto&& src = m_layouts[i];
auto&& src = m_fastrun_layouts[i];
bool cond_normal = src.format.is_default() &&
(src.dtype.category() == DTypeCategory::FLOAT ||
src.dtype.category() == DTypeCategory::INT ||
@@ -655,9 +844,9 @@ AlgoChooser<Opr>::AlgoChooserHelper::profile_single_algo(
param.dtypes[i] = src.dtype.enumv();
}
param.comp_node_loc = m_cn.locator();
mgb_assert(param.shapes.size() == m_layouts.size());
mgb_assert(param.shapes.size() == m_fastrun_layouts.size());
for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_layouts[i];
param.shapes[i] = m_fastrun_layouts[i];
param.opr_param = m_dnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess;
@@ -692,7 +881,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto target_attr = extract_algo_attribute(selected_strategy);
std::string layouts_str =
format_fixlayouts<Opr>(m_layouts, arity_in, arity_out);
format_fixlayouts<Opr>(m_fastrun_layouts, arity_in, arity_out);
double cur_timeout = 0;
auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
@@ -761,10 +950,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
FixedTensorLayouts origin_layouts = m_layouts;
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
origin_layouts.size(), &origin_param,
AlgoChooserProfileCache::Key cache_key{incache_layouts.data(),
incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
@@ -774,15 +963,20 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
template <typename Opr>
Maybe<PreprocessFilter<Opr>>
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter() const {
AlgoChooser<Opr>::AlgoChooserHelper::construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter")))
Maybe<PreprocessFilter<Opr>> result = None;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!m_allow_weight_preprocess)
return;
auto opr = _(m_dnn_opr);
auto layouts = APPLY(opr->deduce_preprocessed_filter_layout(args...),
m_layouts);
*layouts_ptr);
//! no preprocess layouts means weight preprocess is not needed
if (layouts.empty()) {
return;
@@ -825,6 +1019,16 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute(
ret.second |= AlgoAttribute::NAIVE;
}
//! from graph option
if (owner_graph()->options().fast_run_config.shared_batch_size) {
ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
}
if (owner_graph()->options().fast_run_config.binary_equal_between_batch) {
ret.first |= AlgoAttribute::REPRODUCIBLE;
ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
}
return ret;
}
@@ -854,7 +1058,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::extract_algo_attribute(
template size_t \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_workspace_size_bytes( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& \
policy) const; \
policy, \
const FixedTensorLayouts& layouts) const; \
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_all_candidates() const; \
template Maybe<AlgoChooserProfileCache::ResultEntry> \
@@ -942,10 +1147,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
if (!policy.algo.valid()) {
policy = get_policy(helper);
}
size_t workspace = helper.get_workspace_size_bytes(policy);
size_t workspace = helper.get_workspace_size_bytes(policy, layouts);
std::string ret;
ret.append(mgb_opr->dyn_typeinfo()->name);
ret.append(": tensor layouts");
ret += format_fixlayouts<Opr>(layouts, arity_in, arity_out);
Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "Unknown algo description");
@@ -68,7 +68,10 @@ class AlgoChooser {
public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class AlgoChooserHelper {
FixedTensorLayouts m_layouts;
//! fastrun layouts
FixedTensorLayouts m_fastrun_layouts;
//! layouts used when get and set cache item
FixedTensorLayouts m_incache_layouts;
Opr* m_dnn_opr;
std::string m_param;
const cg::OperatorNodeBase* m_base_mgb_opr;
@@ -89,7 +92,7 @@ public:
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; }
const TensorLayout& inp_layout(size_t idx) const {
return m_layouts[idx];
return m_fastrun_layouts[idx];
}
cg::ComputingGraph* owner_graph() const {
return m_base_mgb_opr->owner_graph();
@@ -109,7 +112,13 @@ public:
return m_dnn_opr->get_algorithm_from_desc(desc);
}
const FixedTensorLayouts& layouts() const { return m_layouts; }
const FixedTensorLayouts& fastrun_layouts() const {
return m_fastrun_layouts;
}
const FixedTensorLayouts& incache_layouts() const {
return m_incache_layouts;
}
//! construct algo chain by heuristic
ImplExecutionPolicy choose_by_heuristic(
@@ -141,7 +150,8 @@ public:
//! get workspace size required for specific execution policy
size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const;
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts = {}) const;
//! get all candidate algos, and the one choose_by_heuristic() is
//! put first
@@ -173,7 +183,8 @@ public:
const ExecutionStrategy& strategy) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts = {}) const;
};
template <typename U>
@@ -54,11 +54,11 @@ constexpr bool opr_contain_bias() {
return std::is_same<Opr, megdnn::ConvBias>::value;
}
//! matmul and batchedMatrixMul may not be usable once shape changed
//! matmul and batchedMatrixMul
template <typename Opr>
constexpr bool algo_usable_on_shape_change() {
return !(std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value);
constexpr bool is_matmul() {
return std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
}
template <typename Opr, bool has_prep>
@@ -0,0 +1,304 @@
/**
* \file src/opr/test/algo_chooser.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
| #include "megbrain/comp_node_env.h" | |||
| #include "megbrain/opr/blas.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/test/autocheck.h" | |||
| #include "megbrain/test/helper.h" | |||
| #include "megbrain/test/megdnn_helper.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/opr/basic_arith.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megdnn/oprs/base.h" | |||
| #include "megdnn/dtype.h" | |||
| #include <cmath> | |||
| #include <random> | |||
| #include <utility> | |||
| using namespace mgb; | |||
| namespace { | |||
| #if MGB_CUDA | |||
| #if MGB_ENABLE_FASTRUN | |||
| template <typename MgbOpr, int arith> | |||
| struct GraphMaker; | |||
| template <typename MgbOpr> | |||
| struct GraphMaker<MgbOpr, 2> { | |||
| SymbolVar operator()(const std::array<cg::SymbolVar, 2>& inputs, | |||
| typename MgbOpr::Param& param, | |||
| typename MgbOpr::ExecutionPolicy& policy) { | |||
| return MgbOpr::make(inputs[0], inputs[1], param, policy); | |||
| } | |||
| }; | |||
| template <> | |||
| struct GraphMaker<opr::ConvolutionBackwardData, 2> { | |||
| SymbolVar operator()( | |||
| const std::array<cg::SymbolVar, 2>& inputs, | |||
| opr::ConvolutionBackwardData::Param& param, | |||
| opr::ConvolutionBackwardData::ExecutionPolicy& policy) { | |||
| return opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1], | |||
| param, policy); | |||
| } | |||
| }; | |||
| template <> | |||
| struct GraphMaker<opr::Convolution3DBackwardData, 2> { | |||
| SymbolVar operator()( | |||
| const std::array<cg::SymbolVar, 2>& inputs, | |||
| opr::Convolution3DBackwardData::Param& param, | |||
| opr::Convolution3DBackwardData::ExecutionPolicy& policy) { | |||
| return opr::Convolution3DBackwardData::make_deconv(inputs[0], inputs[1], | |||
| param, policy); | |||
| } | |||
| }; | |||
| template <typename MgbOpr> | |||
| struct GraphMaker<MgbOpr, 3> { | |||
| SymbolVar operator()(const std::array<cg::SymbolVar, 3>& inputs, | |||
| typename MgbOpr::Param& param, | |||
| typename MgbOpr::ExecutionPolicy& policy) { | |||
| return MgbOpr::make(inputs[0], inputs[1], inputs[2], param, policy, {}); | |||
| } | |||
| }; | |||
| template <typename MgbOpr> | |||
| struct GraphMaker<MgbOpr, 4> { | |||
| SymbolVar operator()(const std::array<cg::SymbolVar, 4>& inputs, | |||
| typename MgbOpr::Param& param, | |||
| typename MgbOpr::ExecutionPolicy& policy) { | |||
| return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, | |||
| policy, {}); | |||
| } | |||
| }; | |||
| template <typename MgbOpr> | |||
| struct GraphMaker<MgbOpr, 5> { | |||
| SymbolVar operator()(const std::array<cg::SymbolVar, 5>& inputs, | |||
| typename MgbOpr::Param& param, | |||
| typename MgbOpr::ExecutionPolicy& policy) { | |||
| return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], | |||
| inputs[4], param, policy, {}); | |||
| } | |||
| }; | |||
| template <typename MgbOpr, int arith, typename dtype = dtype::Float32> | |||
| void test_fastrun_opr(std::array<TensorShape, arith> inps0, | |||
| std::array<TensorShape, arith> inps1, | |||
| size_t expect_nr_cache_set_inp0 = 0, | |||
| size_t expect_nr_cache_set_inp1 = 0, | |||
| typename MgbOpr::Param param = {}) { | |||
| using Policy = opr::Convolution::ExecutionPolicy; | |||
| using S = Policy::Strategy; | |||
| using InputGenerator = std::function<void(HostTensorND & dest)>; | |||
| using ShapeInpArray = std::array<TensorShape, arith>; | |||
| using CacheMem = std::pair<const void*, size_t>; | |||
| auto on_get = [](const std::string&, const void*, size_t, const void*, | |||
| size_t) {}; | |||
| std::vector<std::pair<CacheMem, CacheMem>> cache_set_history; | |||
| auto on_set = [&cache_set_history](const std::string&, const void* key, | |||
| size_t key_size, const void* val, | |||
| size_t val_size) { | |||
| cache_set_history.emplace_back(std::make_pair(key, key_size), | |||
| std::make_pair(val, val_size)); | |||
| }; | |||
| PersistentCacheHook cache_hook{on_get, on_set}; | |||
| CompNode comp_node = CompNode::load("xpu0"); | |||
| GraphMaker<MgbOpr, arith> graph_maker; | |||
| auto run = [¶m, &comp_node, &graph_maker]( | |||
| const std::shared_ptr<cg::ComputingGraph>& graph, | |||
| const ShapeInpArray& shapes) { | |||
| std::array<InputGenerator, arith> inputs_generator; | |||
| std::array<std::shared_ptr<HostTensorND>, arith> inputs; | |||
| for (size_t i = 0; i < arith; ++i) { | |||
| inputs[i] = std::make_shared<HostTensorND>(comp_node, | |||
| dtype()); | |||
| } | |||
| HostTensorGenerator<dtype> gen_host; | |||
| for (size_t i = 0; i < arith; ++i) { | |||
| inputs[i]->resize(shapes[i]); | |||
| *inputs[i] = *gen_host(inputs[i]->shape(), comp_node); | |||
| mgb_assert(inputs[i]->shape().eq_shape(shapes[i])); | |||
| } | |||
| std::array<cg::SymbolVar, arith> sym_in; | |||
| for (size_t i = 0; i < arith; ++i) { | |||
| // to trigger graph trans | |||
| sym_in[i] = opr::Host2DeviceCopy::make(*graph, inputs[i], | |||
| ssprintf("inp%zu", i)); | |||
| } | |||
| Policy policy; | |||
| policy.strategy = S::PROFILE; | |||
| auto out = graph_maker(sym_in, param, policy); | |||
| std::unique_ptr<cg::AsyncExecutable> func = | |||
| graph->compile({{out, {}}}); | |||
| func->execute(); | |||
| }; | |||
| std::shared_ptr<cg::ComputingGraph> fastrun_ignore_batchsize_graph = | |||
| ComputingGraph::make(); | |||
| fastrun_ignore_batchsize_graph->options() | |||
| .fast_run_config.shared_batch_size = 20; | |||
| run(fastrun_ignore_batchsize_graph, inps0); | |||
| size_t nr_set_inp0 = cache_set_history.size(); | |||
| if (expect_nr_cache_set_inp0) { | |||
| ASSERT_EQ(cache_set_history.size(), expect_nr_cache_set_inp0); | |||
| } | |||
| run(fastrun_ignore_batchsize_graph, inps1); | |||
| size_t nr_set_total = expect_nr_cache_set_inp1 + nr_set_inp0; | |||
| ASSERT_EQ(cache_set_history.size(), nr_set_total); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution) { | |||
| REQUIRE_GPU(1); | |||
| test_fastrun_opr<opr::Convolution, 2>( | |||
| {TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}}, | |||
| {TensorShape{1, 3, 36, 36}, TensorShape{4, 3, 3, 3}}); | |||
| test_fastrun_opr<opr::ConvolutionBackwardData, 2>( | |||
| {TensorShape{12, 4, 23, 29}, TensorShape{4, 5, 3, 2}}, | |||
| {TensorShape{2, 4, 23, 29}, TensorShape{4, 5, 3, 2}}); | |||
| test_fastrun_opr<opr::ConvolutionBackwardFilter, 3>( | |||
| {TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28}, | |||
| TensorShape{5, 4, 3, 2}}, | |||
| {TensorShape{2, 4, 23, 29}, TensorShape{2, 5, 21, 28}, | |||
| TensorShape{5, 4, 3, 2}}); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeConvBias) { | |||
| REQUIRE_GPU(1); | |||
| test_fastrun_opr<opr::ConvBias, 3>( | |||
| {TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3}, | |||
| TensorShape{1, 24, 1, 1}}, | |||
| {TensorShape{1, 16, 50, 50}, TensorShape{24, 16, 3, 3}, | |||
| TensorShape{1, 24, 1, 1}}); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution3D) { | |||
| REQUIRE_GPU(1); | |||
| test_fastrun_opr<opr::Convolution3D, 2>( | |||
| {TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}}, | |||
| {TensorShape{3, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}}); | |||
| test_fastrun_opr<opr::Convolution3DBackwardData, 2>( | |||
| {TensorShape{14, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}}, | |||
| {TensorShape{4, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}}); | |||
| test_fastrun_opr<opr::Convolution3DBackwardFilter, 3>( | |||
| {TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18}, | |||
| TensorShape{16, 16, 1, 1, 1}}, | |||
| {TensorShape{4, 16, 18, 18, 18}, TensorShape{4, 16, 18, 18, 18}, | |||
| TensorShape{16, 16, 1, 1, 1}}); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeLocalShare) { | |||
| REQUIRE_GPU(1); | |||
| opr::LocalShare::Param local_share_param; | |||
| local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION; | |||
| local_share_param.pad_h = local_share_param.pad_w = 1; | |||
| local_share_param.stride_h = local_share_param.stride_w = 1; | |||
| local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2; | |||
| test_fastrun_opr<opr::LocalShareForward, 2>( | |||
| {TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, | |||
| {TensorShape{3, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, 0, 0, | |||
| local_share_param); | |||
| test_fastrun_opr<opr::LocalShareBackwardData, 3>( | |||
| {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24}, | |||
| TensorShape{32, 128, 24, 24}}, | |||
| {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{2, 128, 24, 24}, | |||
| TensorShape{2, 128, 24, 24}}); | |||
| test_fastrun_opr<opr::LocalShareBackwardFilter, 3>( | |||
| {TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35}, | |||
| TensorShape{3, 3, 3, 3, 3, 4}}, | |||
| {TensorShape{4, 3, 36, 36}, TensorShape{4, 4, 35, 35}, | |||
| TensorShape{3, 3, 3, 3, 3, 4}}); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeDeformableConv) { | |||
| REQUIRE_GPU(1); | |||
| test_fastrun_opr<opr::DeformableConvForward, 4>( | |||
| {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||
| TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}}, | |||
| {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||
| TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}}); | |||
| test_fastrun_opr<opr::DeformableConvBackwardData, 5>( | |||
| {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||
| TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}, | |||
| TensorShape{12, 6, 18, 18}}, | |||
| {TensorShape{4, 6, 20, 20}, | |||
| TensorShape{6, 6, 3, 3}, | |||
| TensorShape{4, 18, 18, 18}, | |||
| TensorShape{4, 9, 18, 18}, | |||
| TensorShape{4, 6, 18, 18}}); | |||
| test_fastrun_opr<opr::DeformableConvBackwardFilter, 5>( | |||
| {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||
| TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}, | |||
| TensorShape{12, 6, 18, 18}}, | |||
| {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3}, | |||
| TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}, | |||
| TensorShape{4, 6, 18, 18}}); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeMatrixMul) { | |||
| REQUIRE_GPU(1); | |||
| //! fastrun_shared_batch_size == 20 | |||
| //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin | |||
| //! {12(10), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA | |||
| //! {12(10), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB | |||
| //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB | |||
| //! | |||
| //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin duplicate | |||
| //! {12(4), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA | |||
| //! {12(4), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB | |||
| //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB duplicate | |||
| test_fastrun_opr<opr::MatrixMul, 2>( | |||
| {TensorShape{10, 12}, TensorShape{12, 12}}, | |||
| {TensorShape{4, 12}, TensorShape{12, 12}}, 4, 2); | |||
| } | |||
| TEST(TestOprDNN, FastrunIgnoreBatchSizeBatchedMatrixMul) { | |||
| REQUIRE_GPU(1); | |||
| //! fastrun_shared_batch_size == 20 | |||
| //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin | |||
| //! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA | |||
| //! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB | |||
| //! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB | |||
| //! | |||
| //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin duplicate | |||
| //! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA duplicate | |||
| //! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB duplicate | |||
| //! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB duplicate | |||
| test_fastrun_opr<opr::BatchedMatrixMul, 2>( | |||
| {TensorShape{12, 6, 8}, TensorShape{12, 8, 4}}, | |||
| {TensorShape{4, 6, 8}, TensorShape{4, 8, 4}}); | |||
| } | |||
| #endif // MGB_ENABLE_FASTRUN | |||
| #endif // MGB_CUDA | |||
| } // anonymous namespace | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -460,12 +460,13 @@ mgb::make_callback_copy(SymbolVar dev, HostTensorND &host, bool sync) { | |||
| /* ========================== PersistentCacheHook ========================== */ | |||
| class PersistentCacheHook::HookedImpl final : public PersistentCache { | |||
| GetHook m_on_get; | |||
| Hook m_on_get, m_on_set; | |||
| public: | |||
| std::shared_ptr<PersistentCache> orig_impl; | |||
| HookedImpl(GetHook on_get) : m_on_get{std::move(on_get)} {} | |||
| HookedImpl(Hook on_get, Hook on_set) | |||
| : m_on_get{std::move(on_get)}, m_on_set{std::move(on_set)} {} | |||
| Maybe<Blob> get(const std::string& category, const Blob& key) override { | |||
| auto ret = orig_impl->get(category, key); | |||
| @@ -476,12 +477,18 @@ public: | |||
| void put(const std::string& category, const Blob& key, | |||
| const Blob& value) override { | |||
| m_on_set(category, key.ptr, key.size, value.ptr, | |||
| value.size); | |||
| orig_impl->put(category, key, value); | |||
| } | |||
| }; | |||
| PersistentCacheHook::PersistentCacheHook(GetHook on_get) | |||
| : m_impl{std::make_shared<HookedImpl>(std::move(on_get))} { | |||
| PersistentCacheHook::Hook PersistentCacheHook::default_set_hook = | |||
| [](const std::string&, const void*, size_t, const void*, size_t) {}; | |||
| PersistentCacheHook::PersistentCacheHook(Hook on_get, Hook on_set) | |||
| : m_impl{std::make_shared<HookedImpl>(std::move(on_get), | |||
| std::move(on_set))} { | |||
| m_impl->orig_impl = PersistentCache::set_impl(m_impl); | |||
| } | |||
| @@ -512,17 +512,17 @@ bool check_device_type_avaiable(CompNode::DeviceType device_type); | |||
//! hook persistent cache get/put calls during the lifetime
class PersistentCacheHook {
class HookedImpl;
std::shared_ptr<HookedImpl> m_impl;
public:
//! if value is not available, \p val and \p val_size would be zero
using GetHook = thin_function<void(const std::string& category,
const void* key, size_t key_size,
const void* val, size_t val_size)>;
PersistentCacheHook(GetHook on_get);
using Hook = thin_function<void(const std::string& category,
const void* key, size_t key_size,
const void* val, size_t val_size)>;
PersistentCacheHook(Hook on_get, Hook on_set = default_set_hook);
~PersistentCacheHook();
private:
static Hook default_set_hook;
class HookedImpl;
std::shared_ptr<HookedImpl> m_impl;
};
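Existing get-only hooks stay source-compatible thanks to the defaulted second argument; a hedged usage sketch of the two-hook form, mirroring the fastrun tests above:

    auto on_get = [](const std::string&, const void*, size_t, const void*,
                     size_t) {};
    auto on_set = [](const std::string& category, const void* key,
                     size_t key_size, const void* val, size_t val_size) {
        // observe every cache write while the hook is alive
    };
    PersistentCacheHook cache_hook{on_get, on_set};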
//! skip a testcase if xpu not available
#define REQUIRE_XPU(n) do { \