GitOrigin-RevId: 4c43a149f8
tags/v1.11.0
| @@ -0,0 +1,29 @@ | |||
| #include "src/arm_common/adaptive_pooling/opr_impl.h" | |||
| #include "src/common/opr_delegate.h" | |||
| #include "src/naive/handle.h" | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| void AdaptivePoolingImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| auto adapt_fwd = [=]() { | |||
| auto opr = inplace_cpu_handle()->create_operator<PoolingForward>(); | |||
| opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
| opr->exec(src, dst, workspace); | |||
| }; | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(adapt_fwd()); | |||
| return; | |||
| } | |||
| size_t AdaptivePoolingImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| auto opr = inplace_cpu_handle()->create_operator<PoolingForward>(); | |||
| opr->param() = deduce_pooling_param(src, dst); | |||
| auto need_size = opr->get_workspace_in_bytes(src, dst); | |||
| return need_size; | |||
| } | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
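For context, AdaptivePoolingImpl above does no pooling itself: it re-parameterizes the request as an ordinary PoolingForward on an in-place CPU handle. Below is a minimal sketch of how such a pooling param can be deduced from the input/output spatial sizes, assuming the output evenly tiles the input (global pooling is the OH == OW == 1 case); the real logic lives in AdaptivePoolingBase::deduce_pooling_param, which appears further down in this diff.

    // hypothetical helper, not part of the diff: window/stride derivation for
    // adaptive pooling when OH, OW evenly tile IH, IW
    param::Pooling deduce_pooling_param_sketch(
            size_t IH, size_t IW, size_t OH, size_t OW) {
        param::Pooling ret;
        ret.pad_h = ret.pad_w = 0;
        ret.stride_h = IH / OH;
        ret.stride_w = IW / OW;
        ret.window_h = IH - (OH - 1) * ret.stride_h;
        ret.window_w = IW - (OW - 1) * ret.stride_w;
        return ret;  // mode/format are copied from the AdaptivePooling param
    }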
| @@ -0,0 +1,20 @@ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| class AdaptivePoolingImpl final : public AdaptivePoolingForward { | |||
| public: | |||
| using AdaptivePoolingForward::AdaptivePoolingForward; | |||
| void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) override; | |||
| }; | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -2,6 +2,7 @@ | |||
| #include "src/arm_common/handle.h" | |||
| #include "src/arm_common/adaptive_pooling/opr_impl.h" | |||
| #include "src/arm_common/conv_bias/opr_impl.h" | |||
| #include "src/arm_common/convolution/opr_impl.h" | |||
| #include "src/arm_common/cvt_color/opr_impl.h" | |||
| @@ -45,6 +46,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(RNNCell) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(LSTMCell) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(LSTM) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(AdaptivePooling) | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wpragmas" | |||
| @@ -2,6 +2,7 @@ | |||
| #include "src/arm_common/pooling/algo.h" | |||
| #include "src/common/algo_chooser.h" | |||
| #include "src/common/metahelper.h" | |||
| #include "src/common/opr_delegate.h" | |||
| using namespace megdnn; | |||
| using namespace arm_common; | |||
| @@ -48,10 +49,72 @@ public: | |||
| }; | |||
| PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; | |||
| namespace { | |||
| TensorLayout merge_hw_layout(TensorLayout src) { | |||
| src.ndim -= 1; | |||
| src.shape[2] = src.shape[2] * src.shape[3]; | |||
| src.stride[2] = src.stride[3]; | |||
| for (size_t i = 3; i < src.ndim; ++i) { | |||
| src.shape[i] = src.shape[i + 1]; | |||
| src.stride[i] = src.stride[i + 1]; | |||
| } | |||
| return src; | |||
| } | |||
| std::pair<TensorND, TensorND> get_global_pooling_reduce_tensor( | |||
| const TensorND& src, const TensorND& dst) { | |||
| auto reduce_src_layout = merge_hw_layout(src.layout); | |||
| auto reduce_dst_layout = merge_hw_layout(dst.layout); | |||
| return std::make_pair<TensorND, TensorND>( | |||
| {src.raw_ptr(), reduce_src_layout}, {dst.raw_ptr(), reduce_dst_layout}); | |||
| } | |||
| std::unique_ptr<Reduce> get_global_pooling_reduce_opr( | |||
| Handle* handle, const PoolingImpl::PoolingKernSizeParam& param) { | |||
| std::unique_ptr<Reduce> opr; | |||
| if (handle) { | |||
| opr = handle->create_operator<Reduce>(); | |||
| } else { | |||
| opr = inplace_cpu_handle()->create_operator<Reduce>(); | |||
| } | |||
| param::Reduce reduce_param; | |||
| reduce_param.axis = 2; | |||
| if (param.mode == PoolingImpl::Param::Mode::MAX) { | |||
| reduce_param.mode = param::Reduce::Mode::MAX; | |||
| } else { | |||
| megdnn_assert( | |||
| param.mode == PoolingImpl::Param::Mode::AVERAGE || | |||
| param.mode == PoolingImpl::Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING); | |||
| reduce_param.mode = param::Reduce::Mode::MEAN; | |||
| } | |||
| opr->param() = reduce_param; | |||
| return opr; | |||
| } | |||
| bool is_global_pooling_reduce(PoolingImpl::PoolingKernSizeParam& param) { | |||
| bool fmt_ok = param.format == PoolingImpl::Param::Format::NCHW || | |||
| param.format == PoolingImpl::Param::Format::NCHW44 || | |||
| param.format == PoolingImpl::Param::Format::NCHW88; | |||
| bool size_ok = param.filter[0] == param.isz[0] && param.filter[1] == param.isz[1] && | |||
| param.padding[0] == 0 && param.padding[1] == 0 && | |||
| param.osz[0] == 1 && param.osz[1] == 1; | |||
| bool dtype_ok = param.src_type == param.dst_type && | |||
| param.src_type.enumv() != DTypeEnum::Int8; | |||
| return fmt_ok && size_ok && dtype_ok; | |||
| } | |||
| } // namespace | |||
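The helpers above turn a global pooling into a Reduce along axis 2 by merging the H and W axes. A small shape walk-through (shapes assumed, not taken from the diff):

    // NCHW   : {N, C, H, W}      -> {N, C, H*W}      -> reduce(axis=2) -> {N, C, 1}
    // NCHW44 : {N, C/4, H, W, 4} -> {N, C/4, H*W, 4} -> reduce(axis=2) -> {N, C/4, 1, 4}
    TensorLayout src({2, 8, 7, 7}, dtype::Float32());  // contiguous NCHW
    TensorLayout merged = merge_hw_layout(src);         // {2, 8, 49}
    megdnn_assert(merged.ndim == 3 && merged[2] == 49);
    // MAX maps to Reduce::Mode::MAX, AVERAGE(_COUNT_EXCLUDE_PADDING) to MEAN;
    // zero padding and a window covering the whole input are required, which
    // is exactly what is_global_pooling_reduce checks.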
| size_t PoolingImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| auto param = make_pooling_kern_szie_param(this, src, dst); | |||
| bool fwd_reduce = is_global_pooling_reduce(param); | |||
| if (fwd_reduce) { | |||
| TensorND src_tensor{nullptr, src}; | |||
| TensorND dst_tensor{nullptr, dst}; | |||
| auto reduce_tensor = get_global_pooling_reduce_tensor(src_tensor, dst_tensor); | |||
| auto&& opr = get_global_pooling_reduce_opr(nullptr, param); | |||
| auto reduce_need = opr->get_workspace_in_bytes( | |||
| reduce_tensor.first.layout, reduce_tensor.second.layout); | |||
| return reduce_need; | |||
| } | |||
| auto algo = get_algorithm(this, src, dst); | |||
| if (!is_fallback_algo(algo)) { | |||
| size_t arm_common_workspace = 0; | |||
| @@ -93,6 +156,18 @@ void PoolingImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| auto param = make_pooling_kern_param(this, src, dst, workspace); | |||
| bool fwd_reduce = is_global_pooling_reduce(param); | |||
| if (fwd_reduce) { | |||
| auto global_pooling_fwd = [=]() { | |||
| auto reduce_tensor = get_global_pooling_reduce_tensor(src, dst); | |||
| auto&& opr = get_global_pooling_reduce_opr(nullptr, param); | |||
| opr->exec(reduce_tensor.first, reduce_tensor.second, workspace); | |||
| }; | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR(global_pooling_fwd()); | |||
| return; | |||
| } | |||
| auto algo = get_algorithm(this, src.layout, dst.layout); | |||
| if (!is_fallback_algo(algo)) { | |||
| algo->exec(param); | |||
| @@ -8,7 +8,9 @@ param::Pooling AdaptivePoolingBase::deduce_pooling_param( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| auto param_format = param().format; | |||
| size_t IH, IW, OH, OW; | |||
| if (param_format == param::AdaptivePooling::Format::NCHW) { | |||
| if (param_format == param::AdaptivePooling::Format::NCHW || | |||
| param_format == param::AdaptivePooling::Format::NCHW44 || | |||
| param_format == param::AdaptivePooling::Format::NCHW88) { | |||
| IH = src.shape[2]; | |||
| IW = src.shape[3]; | |||
| OH = dst.shape[2]; | |||
| @@ -19,7 +21,8 @@ param::Pooling AdaptivePoolingBase::deduce_pooling_param( | |||
| OH = dst.shape[1]; | |||
| OW = dst.shape[2]; | |||
| } else { | |||
| megdnn_throw("AdaptivePooling only support NCHW or NHWC format"); | |||
| megdnn_throw( | |||
| "AdaptivePooling only support NCHW or NHWC or NCHW44 or NCHW88 format"); | |||
| } | |||
| param::Pooling ret; | |||
| @@ -140,7 +140,9 @@ void PoolingForward::check_exec( | |||
| const TensorLayout& src, const TensorLayout& dst, size_t workspace_in_bytes) { | |||
| check_layout_fwd(src, dst); | |||
| auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
| megdnn_assert( | |||
| workspace_in_bytes >= required_workspace_in_bytes, "need %zu, get %zu", | |||
| required_workspace_in_bytes, workspace_in_bytes); | |||
| } | |||
| void PoolingBackward::check_exec( | |||
| @@ -6,11 +6,17 @@ | |||
| namespace megdnn { | |||
| namespace naive { | |||
| size_t AdaptivePoolingForwardImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
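| // assumption: inplace_cpu_handle(2) selects the naive-level handle, so the reference implementation does not dispatch back into optimized x86/arm pooling kernels | |||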
| auto opr = inplace_cpu_handle(2)->create_operator<PoolingForward>(); | |||
| opr->param() = deduce_pooling_param(src, dst); | |||
| auto need_size = opr->get_workspace_in_bytes(src, dst); | |||
| return need_size; | |||
| } | |||
| void AdaptivePoolingForwardImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), { | |||
| auto opr = inplace_cpu_handle()->create_operator<PoolingForward>(); | |||
| auto opr = inplace_cpu_handle(2)->create_operator<PoolingForward>(); | |||
| opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
| opr->exec(src, dst, workspace); | |||
| }); | |||
| @@ -20,7 +26,7 @@ void AdaptivePoolingBackwardImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_tensor_in diff, | |||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) { | |||
| MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), { | |||
| auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>(); | |||
| auto opr = inplace_cpu_handle(2)->create_operator<PoolingBackward>(); | |||
| opr->param() = deduce_pooling_param(src.layout, dst.layout); | |||
| opr->exec(src, dst, diff, grad, workspace); | |||
| }); | |||
| @@ -29,7 +35,7 @@ void AdaptivePoolingBackwardImpl::exec( | |||
| size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, | |||
| const TensorLayout& grad) { | |||
| auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>(); | |||
| auto opr = inplace_cpu_handle(2)->create_operator<PoolingBackward>(); | |||
| opr->param() = deduce_pooling_param(src, dst); | |||
| return opr->get_workspace_in_bytes(src, dst, diff, grad); | |||
| } | |||
| @@ -11,9 +11,7 @@ public: | |||
| void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override; | |||
| }; | |||
| class AdaptivePoolingBackwardImpl : public AdaptivePoolingBackward { | |||
| @@ -0,0 +1,98 @@ | |||
| #include "test/arm_common/fixture.h" | |||
| #include "megdnn/tensor_iter.h" | |||
| #include "src/common/utils.h" | |||
| #include "test/common/adaptive_pooling.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| TEST_F(ARM_COMMON, ADAPTIVE_POOLING_FORWARD_NCHW44) { | |||
| auto args = adaptive_pooling::get_args_nchw44(); | |||
| Checker<AdaptivePooling> checker(handle()); | |||
| checker.set_epsilon(1e-4); | |||
| for (DType dtype : {(DType)dtype::Float32(), (DType)dtype::QuantizedS8(1.0)}) | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto src = arg.ishape; | |||
| auto dst = arg.oshape; | |||
| checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec( | |||
| TensorShapeArray{src, dst, {}}); | |||
| } | |||
| } | |||
| TEST_F(ARM_COMMON, ADAPTIVE_POOLING_FORWARD) { | |||
| auto args = adaptive_pooling::get_args(); | |||
| Checker<AdaptivePooling> checker(handle()); | |||
| checker.set_epsilon(1e-4); | |||
| for (DType dtype : {(DType)dtype::Float32(), (DType)dtype::QuantizedS8(1.0)}) | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto src = arg.ishape; | |||
| auto dst = arg.oshape; | |||
| checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec( | |||
| TensorShapeArray{src, dst, {}}); | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| namespace { | |||
| void benchmark_globalpooling_nchw44_fp32(Handle* handle) { | |||
| using Param = param::AdaptivePooling; | |||
| auto run = [&](size_t n, size_t c, size_t h, size_t w, Param::Mode mode) { | |||
| Param param; | |||
| param.format = Param::Format::NCHW; | |||
| param.mode = mode; | |||
| TensorShape nchw_shape = {n, c, h, w}; | |||
| TensorShape nchw_dst_shape = {n, c, 1, 1}; | |||
| TensorShape nchw44_shape = {n, c / 4, h, w, 4}; | |||
| TensorShape nchw44_dst_shape = {n, c / 4, 1, 1, 4}; | |||
| TensorLayout dst_layout; | |||
| float calc_amount = n * c * h * w; | |||
| Benchmarker<AdaptivePooling> benchmarker_float_nchw(handle); | |||
| Benchmarker<AdaptivePooling> benchmarker_float_nchw44(handle); | |||
| Benchmarker<AdaptivePooling> benchmarker_int_nchw44(handle); | |||
| size_t RUN = 500; | |||
| auto t1 = benchmarker_float_nchw.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw_shape, nchw_dst_shape}); | |||
| param.format = Param::Format::NCHW44; | |||
| auto t2 = benchmarker_int_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .execl({{nchw44_shape, dtype::QuantizedS8(1.0)}, | |||
| {nchw44_dst_shape, dtype::QuantizedS8(1.0)}}); | |||
| auto t3 = benchmarker_float_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw44_shape, nchw44_dst_shape}); | |||
| printf("{%zu %zu %zu %zu} \n" | |||
| "nchw_fp32={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_int={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n", | |||
| n, c, h, w, t1 / RUN, calc_amount / (t1 / RUN * 1000), t2 / RUN, | |||
| calc_amount / (t2 / RUN * 1000), t3 / RUN, | |||
| calc_amount / (t3 / RUN * 1000), t1 / t3); | |||
| }; | |||
| run(1, 128, 25, 25, param::AdaptivePooling::Mode::AVERAGE); | |||
| } | |||
| } // namespace | |||
| TEST_F(ARM_COMMON, BENCHMARK_GLOBAL_POOLING_NCHW44_FP32) { | |||
| benchmark_globalpooling_nchw44_fp32(handle()); | |||
| } | |||
| #endif | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -40,6 +40,36 @@ inline std::vector<TestArg> get_args() { | |||
| return args; | |||
| } | |||
| inline std::vector<TestArg> get_args_nchw44() { | |||
| std::vector<TestArg> args; | |||
| using Param = param::AdaptivePooling; | |||
| using Mode = param::AdaptivePooling::Mode; | |||
| for (size_t i = 36; i < 40; ++i) { | |||
| args.emplace_back( | |||
| Param{Mode::AVERAGE, Param::Format::NCHW44}, | |||
| TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, i - 4, i - 2, 4}); | |||
| args.emplace_back( | |||
| Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4}, | |||
| TensorShape{2, 3, i - 4, i - 2, 4}); | |||
| args.emplace_back( | |||
| Param{Mode::AVERAGE, Param::Format::NCHW44}, | |||
| TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, 1, 1, 4}); | |||
| args.emplace_back( | |||
| Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4}, | |||
| TensorShape{2, 3, 1, 1, 4}); | |||
| } | |||
| for (size_t i = 5; i < 10; ++i) { | |||
| args.emplace_back( | |||
| Param{Mode::AVERAGE, Param::Format::NCHW44}, | |||
| TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, i - 3, i - 2, 4}); | |||
| args.emplace_back( | |||
| Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4}, | |||
| TensorShape{2, 3, i - 3, i - 2, 4}); | |||
| } | |||
| return args; | |||
| } | |||
| } // namespace adaptive_pooling | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -254,7 +254,9 @@ def optimize_for_inference(dest_vars, **kwargs): | |||
| * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z | |||
| input for inference on nvidia backend(this optimization pass will | |||
| result in mismatch of the precision of output of training and | |||
| inference) | |||
| inference | |||
| * enable_fuse_grain: fuse_grain is enabled by default; it fuses fine-grained operators into larger fused operators and can be disabled. | |||
| ) | |||
| """ | |||
| inference_options = GraphOptimizeOptions() | |||
| inference_optimize_layout_transform_map = { | |||
| @@ -282,6 +284,8 @@ def optimize_for_inference(dest_vars, **kwargs): | |||
| inference_options.fuse_conv_bias_with_z = True | |||
| if kwargs.pop("enable_fuse_preprocess", False): | |||
| inference_options.fuse_preprocess = True | |||
| if kwargs.pop("enable_fuse_grain", True): | |||
| inference_options.fuse_grain = True | |||
| if kwargs: | |||
| raise ValueError("unknown options: %s" % list(kwargs)) | |||
| @@ -330,6 +334,8 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]: | |||
| ret["enable_fuse_conv_bias_with_z"] = True | |||
| if inference_options.fuse_preprocess: | |||
| ret["enable_fuse_preprocess"] = True | |||
| if inference_options.fuse_grain: | |||
| ret["enable_fuse_grain"] = True | |||
| return ret | |||
| @@ -151,7 +151,9 @@ class Network: | |||
| * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z | |||
| input for inference on nvidia backend(this optimization pass will | |||
| result in mismatch of the precision of output of training and | |||
| inference) | |||
| inference | |||
| * enable_fuse_grain: fuse_grain is enabled by default; it fuses fine-grained operators into larger fused operators and can be disabled. | |||
| ) | |||
| """ | |||
| if not isinstance(dest_vars, Sequence): | |||
| @@ -221,7 +223,6 @@ class Network: | |||
| logger.warning( | |||
| '"output_names" is not supported in Network.dump, rename output vars directly' | |||
| ) | |||
| if optimize_for_inference: | |||
| out, optimize_options = G.optimize_for_inference(out, **kwargs) | |||
| @@ -292,7 +292,9 @@ void init_graph_rt(py::module m) { | |||
| &_OptimizeForInferenceOptions::fuse_preprocess) | |||
| .def_readwrite( | |||
| "layout_transform", | |||
| &_OptimizeForInferenceOptions::layout_transform); | |||
| &_OptimizeForInferenceOptions::layout_transform) | |||
| .def_readwrite( | |||
| "fuse_grain", &_OptimizeForInferenceOptions::fuse_grain); | |||
| py::enum_<_LayoutTransform>(GraphOptimizeOptions, "LayoutTransform") | |||
| .value("DEFAULT", _LayoutTransform::DEFAULT) | |||
| @@ -47,6 +47,7 @@ def test_metadata(): | |||
| "user_info": {"str": "x", "tensor": x, "module": M.Module, "none": None}, | |||
| "graph_modified": True, # True: Network.dump | |||
| "optimized_for_inference": True, | |||
| "enable_fuse_grain": True, | |||
| "enable_nchw4": True, | |||
| "enable_ioc16": True, | |||
| } | |||
| @@ -76,6 +76,13 @@ void ModelMdl::make_output_spec() { | |||
| } | |||
| m_asyc_exec = m_load_result.graph_compile(m_output_spec); | |||
| auto new_output_vars = m_asyc_exec->get_output_vars(); | |||
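| //! refresh output_var_list with the compiled graph's output vars (assumed so that later consumers, e.g. the raw-model dump option added below, see the optimized vars) | |||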
| mgb::cg::SymbolVarArray symbol_var_array; | |||
| symbol_var_array.reserve(new_output_vars.size()); | |||
| for (auto output_var : new_output_vars) { | |||
| symbol_var_array.emplace_back(output_var); | |||
| } | |||
| m_load_result.output_var_list = symbol_var_array; | |||
| } | |||
| std::shared_ptr<mgb::serialization::GraphLoader>& ModelMdl::reset_loader( | |||
| @@ -56,6 +56,43 @@ void PackModelOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///////////////////// RawModelOption ////////////////////////// | |||
| std::shared_ptr<OptionBase> RawModelOption::create_option() { | |||
| static std::shared_ptr<RawModelOption> option(new RawModelOption); | |||
| if (RawModelOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| RawModelOption::RawModelOption() { | |||
| m_option_name = "raw_model"; | |||
| if (!FLAGS_model_dump.empty()) | |||
| model_dump = FLAGS_model_dump; | |||
| } | |||
| bool RawModelOption::is_valid() { | |||
| return !FLAGS_model_dump.empty(); | |||
| } | |||
| void RawModelOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| template <typename ModelImpl> | |||
| void RawModelOption::config_model_internel( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelImpl> model) { | |||
| if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| auto model_data = model->get_model_data(); | |||
| std::ofstream ofs(model_dump, std::ios::binary); | |||
| if (!ofs.is_open()) { | |||
| mgb_log_warn("can not open file %s to write model\n", model_dump.c_str()); | |||
| return; | |||
| } | |||
| ofs.write((char*)model_data.data(), model_data.size()); | |||
| ofs.close(); | |||
| mgb_log_warn("success write model to %s\n", model_dump.c_str()); | |||
| } | |||
| } | |||
| ////////////////////// PackModel gflags //////////////////////// | |||
| @@ -79,4 +116,8 @@ DEFINE_string( | |||
| "https://megengine.megvii-inc.com/user-guide/deployment/lite/advance/" | |||
| "pack-lite-model.html for more details."); | |||
| REGIST_OPTION_CREATOR(pack_model, lar::PackModelOption::create_option); | |||
| ///////////////////// RawModel gflags /////////////////////////// | |||
| DEFINE_string(model_dump, "", "The output file path of raw model."); | |||
| REGIST_OPTION_CREATOR(pack_model, lar::PackModelOption::create_option); | |||
| REGIST_OPTION_CREATOR(dump_model, lar::RawModelOption::create_option); | |||
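With the flag above, dumping the raw (possibly optimized) model from the load_and_run tool would look roughly like: load_and_run model.mge --model_dump dumped.mge; the file is written once the run reaches the AFTER_MODEL_RUNNING stage. The exact binary name and invocation are assumptions, not part of this diff.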
| @@ -3,7 +3,7 @@ | |||
| #include "megbrain/graph/operator_node.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_string(model_dump); | |||
| DECLARE_string(packed_model_dump); | |||
| DECLARE_string(pack_info_json); | |||
| DECLARE_string(pack_cache); | |||
| @@ -36,4 +36,22 @@ private: | |||
| std::string pack_model_cryption; | |||
| bool is_fast_run_cache = true; | |||
| }; | |||
| class RawModelOption : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; } | |||
| private: | |||
| RawModelOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>); | |||
| std::string m_option_name; | |||
| std::string model_dump; | |||
| }; | |||
| } // namespace lar | |||
| @@ -124,6 +124,60 @@ void WeightPreprocessOption::config_model( | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///////////////////////// fuse grain optimize options /////////////// | |||
| bool FuseGrainOption::m_valid; | |||
| namespace lar { | |||
| template <> | |||
| void FuseGrainOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite>) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (m_fuse_grain) { | |||
| LITE_THROW("enable fuse-grain optimization not support in lite model"); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void FuseGrainOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (m_fuse_grain) { | |||
| mgb_log_warn("enable fuse-grain optimization"); | |||
| graph_option.graph_opt.enable_fuse_grain(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| FuseGrainOption::FuseGrainOption() { | |||
| m_option_name = "fuse_grain"; | |||
| m_fuse_grain = FLAGS_fuse_grain; | |||
| m_option = {{"fuse_grain", lar::Bool::make(false)}}; | |||
| std::static_pointer_cast<lar::Bool>(m_option["fuse_grain"]) | |||
| ->set_value(FLAGS_fuse_grain); | |||
| } | |||
| bool FuseGrainOption::is_valid() { | |||
| return true; | |||
| } | |||
| std::shared_ptr<OptionBase> FuseGrainOption::create_option() { | |||
| static std::shared_ptr<FuseGrainOption> option(new FuseGrainOption); | |||
| if (FuseGrainOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void FuseGrainOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| m_fuse_grain = | |||
| std::static_pointer_cast<lar::Bool>(m_option["fuse_grain"])->get_value(); | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///// fuse conv bias and nonlinear activation opr optimize options //////// | |||
| bool FuseConvBiasNonlinearOption::m_valid; | |||
| namespace lar { | |||
| @@ -579,6 +633,7 @@ void TensorRTOption::config_model( | |||
| DEFINE_bool( | |||
| enable_fuse_preprocess, false, | |||
| "Fusion astype | pad_channel | dimshuffle and etc opr from h2d opr"); | |||
| DEFINE_bool(fuse_grain, false, "Enable fusing fine-grained oprs into larger fused oprs"); | |||
| DEFINE_bool( | |||
| weight_preprocess, false, | |||
| "Execute operators with weight preprocess, which can optimize the " | |||
| @@ -589,7 +644,7 @@ DEFINE_bool( | |||
| "whether to fuse conv+bias+nonlinearity"); | |||
| DEFINE_bool( | |||
| enable_fuse_conv_bias_with_z, false, | |||
| "fuse conv,bias (elemwise add),z(elemwise add) into one opr " | |||
| "fuse conv, bias (elemwise add), z(elemwise add) into one opr " | |||
| "(only support on GPU)"); | |||
| ///////////////////////// graph retrict options ///////////////////////// | |||
| @@ -636,6 +691,9 @@ REGIST_OPTION_VALIDATER(fuse_preprocess, lar::FusePreprocessOption::set_valid); | |||
| REGIST_OPTION_CREATOR(weight_preprocess, lar::WeightPreprocessOption::create_option); | |||
| REGIST_OPTION_VALIDATER(weight_preprocess, lar::WeightPreprocessOption::set_valid); | |||
| REGIST_OPTION_CREATOR(disable_fuse_grain, lar::FuseGrainOption::create_option); | |||
| REGIST_OPTION_VALIDATER(disable_fuse_grain, lar::FuseGrainOption::set_valid); | |||
| REGIST_OPTION_CREATOR( | |||
| fuse_conv_bias_nonlinearity, lar::FuseConvBiasNonlinearOption::create_option); | |||
| REGIST_OPTION_VALIDATER( | |||
| @@ -5,6 +5,7 @@ | |||
| #include "option_base.h" | |||
| DECLARE_bool(enable_fuse_preprocess); | |||
| DECLARE_bool(fuse_grain); | |||
| DECLARE_bool(weight_preprocess); | |||
| DECLARE_bool(enable_fuse_conv_bias_nonlinearity); | |||
| DECLARE_bool(enable_fuse_conv_bias_with_z); | |||
| @@ -79,7 +80,31 @@ private: | |||
| static bool m_valid; | |||
| OptionValMap m_option; | |||
| }; | |||
| ///////////////////////// fuse grain options ////////////// | |||
| class FuseGrainOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| static void set_valid(bool val) { m_valid = val; }; | |||
| OptionValMap* get_option() override { return &m_option; } | |||
| private: | |||
| FuseGrainOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool m_fuse_grain; | |||
| static bool m_valid; | |||
| OptionValMap m_option; | |||
| }; | |||
| /////////////// fuse_conv_bias_nonlinearity optimize options /////////////// | |||
| class FuseConvBiasNonlinearOption final : public OptionBase { | |||
| public: | |||
| @@ -91,7 +91,7 @@ public: | |||
| }; | |||
| class OutputVarsUserData final : public mgb::UserDataContainer::UserData { | |||
| MGB_TYPEINFO_OBJ_DECL; | |||
| MGB_TYPEINFO_OBJ_DECL_WITH_EXPORT; | |||
| private: | |||
| VarNodeArray m_output_vars; | |||
| @@ -91,6 +91,9 @@ struct GraphCommonOptimizeOptions { | |||
| bool weight_preprocess = false; | |||
| //! fuse preprocess patten, like astype + pad_channel + dimshuffle | |||
| bool fuse_preprocess = false; | |||
| //! fuse_grain pattern: replace fine-grained IR with larger fused IR | |||
| bool fuse_grain = false; | |||
| enum LayoutTransform : uint32_t { | |||
| DEFAULT, | |||
| NCHW4, ///< compute using NCHW4 tensor format | |||
| @@ -124,6 +127,7 @@ struct GraphCommonOptimizeOptions { | |||
| SET(fuse_conv_bias_with_z); | |||
| SET(fuse_preprocess); | |||
| SET(weight_preprocess); | |||
| SET(fuse_grain); | |||
| #undef SET | |||
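The SET expansion above gives fuse_grain the same fluent accessors as the other optimize options. A short sketch of turning it on programmatically, mirroring the graph_opt.enable_fuse_grain() call used by the load_and_run option earlier in this diff:

    auto graph = ComputingGraph::make();
    graph->options().graph_opt.enable_fuse_grain();  // sets fuse_grain = true
    // build the graph, then compile: the folding passes run as part of the
    // optimize-for-inference pipeline when the flag is set.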
| #define SET(_trans, _trans_capital) \ | |||
| GraphCommonOptimizeOptions& enable_##_trans() { \ | |||
| @@ -0,0 +1,180 @@ | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/basic_arith.h" | |||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megbrain/opr/utility.h" | |||
| #include "megbrain/serialization/opr_shallow_copy.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "megbrain/opr/internal/megdnn_opr_wrapper.h" | |||
| #include "../../core/impl/graph/cg_impl.h" | |||
| #include "./gopt_helper.h" | |||
| #include "megbrain/utils/hash_ct.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megbrain_folding_global_pooling) | |||
| #define MIDOUT_B(tag) \ | |||
| MIDOUT_BEGIN(megbrain_folding_global_pooling, midout_iv(MGB_HASH_STR(tag))) { | |||
| #define MIDOUT_E \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| using namespace mgb; | |||
| using namespace gopt; | |||
| /* ==================== FoldingGlobalPoolingPass ================= */ | |||
| const char* FoldingGlobalPoolingPass::name() const { | |||
| return mgb_cstr_log("folding reduce mean pass"); | |||
| } | |||
| void FoldingGlobalPoolingPass::apply(OptState& opt) const { | |||
| MIDOUT_B("FoldingGlobalPoolingPass::apply"); | |||
| FindNext find_tool(opt); | |||
| auto rewriter = opt.graph().make_rewriter(); | |||
| /** | |||
| * | |||
| * reshape+------>reduce(mean or max)+--->axis_add_remove*n | |||
| * || | |||
| * || | |||
| * || | |||
| * \/ | |||
| * adaptive_pooling(1,1) | |||
| */ | |||
| auto try_fuse_global_pooling_axis_add = [&rewriter, | |||
| &find_tool](OperatorNodeBase* opr) { | |||
| ThinHashSet<OperatorNodeBase*> opr_set; | |||
| ThinHashSet<OperatorNodeBase*> reader_set; | |||
| MGB_MARK_USED_VAR(rewriter); | |||
| MGB_MARK_USED_VAR(find_tool); | |||
| auto axis_modi = try_cast_as_op<opr::AxisAddRemove>(opr); | |||
| CHECK_OR_RETURN(axis_modi); | |||
| CHECK_OR_RETURN(find_tool.used_count(axis_modi) <= 1); | |||
| auto output_shape = axis_modi->output(0)->shape(); | |||
| CHECK_OR_RETURN(output_shape.ndim == 4); | |||
| CHECK_OR_RETURN(output_shape[2] == output_shape[3] && output_shape[2] == 1); | |||
| auto axis_input = axis_modi->input(0)->owner_opr(); | |||
| auto axis_modi_x = axis_input->try_cast_final<opr::AxisAddRemove>(); | |||
| auto reduce = axis_input->try_cast_final<opr::Reduce>(); | |||
| while (axis_modi_x) { | |||
| CHECK_OR_RETURN(find_tool.used_count(axis_modi_x) == 1); | |||
| auto axis_input_x = axis_modi_x->input(0)->owner_opr(); | |||
| reduce = axis_input_x->try_cast_final<opr::Reduce>(); | |||
| axis_modi_x = axis_input_x->try_cast_final<opr::AxisAddRemove>(); | |||
| } | |||
| CHECK_OR_RETURN(reduce); | |||
| auto reduce_mode = reduce->param().mode; | |||
| CHECK_OR_RETURN( | |||
| reduce_mode == opr::Reduce::Param::Mode::MAX || | |||
| reduce_mode == opr::Reduce::Param::Mode::MEAN); | |||
| auto reduce_axis = reduce->param().axis; | |||
| CHECK_OR_RETURN(reduce_axis == 2) | |||
| auto reshape = reduce->input(0)->owner_opr()->try_cast_final<opr::Reshape>(); | |||
| CHECK_OR_RETURN(reshape); | |||
| auto reshape_in_shape = reshape->input(0)->shape(); | |||
| auto reshape_out_shape = reshape->output(0)->shape(); | |||
| bool merge_hw = | |||
| reshape_out_shape.ndim == 3 && reshape_in_shape.ndim == 4 && | |||
| reshape_in_shape[2] * reshape_in_shape[3] == reshape_out_shape[2]; | |||
| CHECK_OR_RETURN(merge_hw); | |||
| opr::AdaptivePooling::Param param; | |||
| if (reduce_mode == opr::Reduce::Param::Mode::MAX) { | |||
| param.mode = opr::AdaptivePooling::Param::Mode::MAX; | |||
| } else { | |||
| mgb_assert(reduce_mode == opr::Reduce::Param::Mode::MEAN); | |||
| param.mode = opr::AdaptivePooling::Param::Mode::AVERAGE; | |||
| } | |||
| auto new_node = opr::AdaptivePooling::make( | |||
| rewriter.get_var(reshape->input(0)), {1, 1}, param); | |||
| rewriter.replace_var( | |||
| axis_modi->output(0), new_node.node(), | |||
| mgb_cstr_log("replace reshape+reduce+add_axis -> adaptive pooling")); | |||
| return true; | |||
| }; | |||
| /** | |||
| * | |||
| * reshape+------>reduce(mean or max)+--->dimshuffle(0,1,-1,-1) | |||
| * || | |||
| * || | |||
| * || | |||
| * \/ | |||
| * adaptive_pooling(1,1) | |||
| */ | |||
| auto try_fuse_global_pooling_dimshuffle = [&rewriter, | |||
| &find_tool](OperatorNodeBase* opr) { | |||
| ThinHashSet<OperatorNodeBase*> opr_set; | |||
| ThinHashSet<OperatorNodeBase*> reader_set; | |||
| MGB_MARK_USED_VAR(rewriter); | |||
| MGB_MARK_USED_VAR(find_tool); | |||
| auto dimshuffle = try_cast_as_op<opr::Dimshuffle>(opr); | |||
| CHECK_OR_RETURN(dimshuffle); | |||
| auto pattern_param = dimshuffle->param(); | |||
| CHECK_OR_RETURN(pattern_param.pattern_len == 4); | |||
| auto pattern = pattern_param.pattern; | |||
| CHECK_OR_RETURN( | |||
| pattern[0] == 0 && pattern[1] == 1 && pattern[2] == -1 && pattern[3] == -1); | |||
| auto axis_remove = | |||
| dimshuffle->input(0)->owner_opr()->try_cast_final<opr::AxisAddRemove>(); | |||
| CHECK_OR_RETURN(axis_remove); | |||
| auto reduce = axis_remove->input(0)->owner_opr()->try_cast_final<opr::Reduce>(); | |||
| CHECK_OR_RETURN(reduce); | |||
| auto reduce_mode = reduce->param().mode; | |||
| CHECK_OR_RETURN( | |||
| reduce_mode == opr::Reduce::Param::Mode::MAX || | |||
| reduce_mode == opr::Reduce::Param::Mode::MEAN); | |||
| auto reduce_axis = reduce->param().axis; | |||
| CHECK_OR_RETURN(reduce_axis == 2) | |||
| auto reshape = reduce->input(0)->owner_opr()->try_cast_final<opr::Reshape>(); | |||
| CHECK_OR_RETURN(reshape); | |||
| auto reshape_in_shape = reshape->input(0)->shape(); | |||
| auto reshape_out_shape = reshape->output(0)->shape(); | |||
| bool merge_hw = | |||
| reshape_out_shape.ndim == 3 && reshape_in_shape.ndim == 4 && | |||
| reshape_in_shape[2] * reshape_in_shape[3] == reshape_out_shape[2]; | |||
| CHECK_OR_RETURN(merge_hw); | |||
| opr::AdaptivePooling::Param param; | |||
| if (reduce_mode == opr::Reduce::Param::Mode::MAX) { | |||
| param.mode = opr::AdaptivePooling::Param::Mode::MAX; | |||
| } else { | |||
| mgb_assert(reduce_mode == opr::Reduce::Param::Mode::MEAN); | |||
| param.mode = opr::AdaptivePooling::Param::Mode::AVERAGE; | |||
| } | |||
| auto new_node = opr::AdaptivePooling::make( | |||
| rewriter.get_var(reshape->input(0)), {1, 1}, param); | |||
| rewriter.replace_var( | |||
| dimshuffle->output(0), new_node.node(), | |||
| mgb_cstr_log("replace reshape+reduce+dimshuffle -> adaptive pooling")); | |||
| return true; | |||
| }; | |||
| auto on_opr = [&try_fuse_global_pooling_axis_add, | |||
| &try_fuse_global_pooling_dimshuffle, | |||
| &rewriter](OperatorNodeBase* opr) { | |||
| if (!try_fuse_global_pooling_axis_add(opr) && | |||
| !try_fuse_global_pooling_dimshuffle(opr)) { | |||
| rewriter.auto_replace_outputs(opr); | |||
| } | |||
| }; | |||
| opt.graph().iter(on_opr); | |||
| rewriter.apply_inplace(); | |||
| MIDOUT_E | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
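A compact sketch of the subgraph this pass rewrites, matching the first diagram above (variable names and static shapes are assumed; the gopt tests added later in this diff build the same pattern with symbolic shapes):

    // x : (N, C, H, W)
    auto flat = opr::Reshape::make(x, TensorShape{N, C, H * W});   // merge H and W
    opr::Reduce::Param reduce_param{opr::Reduce::Mode::MEAN, 2};
    auto pooled = opr::Reduce::make(flat, reduce_param);           // (N, C, 1)
    auto y = opr::AxisAddRemove::make(
            pooled, {opr::AxisAddRemove::AxisDesc::make_add(3)});  // (N, C, 1, 1)
    // after FoldingGlobalPoolingPass, y is produced by
    // opr::AdaptivePooling::make(x, {1, 1}, param) with mode AVERAGE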
| @@ -0,0 +1,117 @@ | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/basic_arith.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megbrain/opr/utility.h" | |||
| #include "megbrain/serialization/opr_shallow_copy.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "megbrain/opr/internal/megdnn_opr_wrapper.h" | |||
| #include "../../core/impl/graph/cg_impl.h" | |||
| #include "./gopt_helper.h" | |||
| #include "megbrain/utils/hash_ct.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megbrain_folding_reduce_mean) | |||
| #define MIDOUT_B(tag) \ | |||
| MIDOUT_BEGIN(megbrain_folding_reduce_mean, midout_iv(MGB_HASH_STR(tag))) { | |||
| #define MIDOUT_E \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| using namespace mgb; | |||
| using namespace gopt; | |||
| /* ==================== FoldingReduceMeanPass ================= */ | |||
| const char* FoldingReduceMeanPass::name() const { | |||
| return mgb_cstr_log("folding reduce mean pass"); | |||
| } | |||
| void FoldingReduceMeanPass::apply(OptState& opt) const { | |||
| MIDOUT_B("FoldingReduceMeanPass::apply"); | |||
| FindNext find_tool(opt); | |||
| auto rewriter = opt.graph().make_rewriter(); | |||
| /** | |||
| * reshape+---------->reduce(axis, sum)+--------->axis_remove+----------->true_div | |||
| * | ^ | |||
| * | | | |||
| * +--------------> get_var_shape(axis)+------------>type_cvt(fp32)+-------+ | |||
| * || | |||
| * || | |||
| * \/ | |||
| * reshape+-------->reduce(axis, mean)+--------->axis_remove | |||
| * | |||
| * | |||
| **/ | |||
| auto try_fuse_reduce_mean = [&rewriter, &find_tool](OperatorNodeBase* opr) { | |||
| ThinHashSet<OperatorNodeBase*> opr_set; | |||
| ThinHashSet<OperatorNodeBase*> reader_set; | |||
| MGB_MARK_USED_VAR(rewriter); | |||
| // check true_div | |||
| auto elemwise = try_cast_as_op<opr::Elemwise>(opr); | |||
| CHECK_OR_RETURN(elemwise); | |||
| auto mode_ok = elemwise->param().mode == opr::Elemwise::Mode::TRUE_DIV; | |||
| CHECK_OR_RETURN(mode_ok); | |||
| auto input0 = elemwise->input(0)->owner_opr(); | |||
| auto remove_axis = input0->try_cast_final<opr::AxisAddRemove>(); | |||
| auto reduce = input0->try_cast_final<opr::Reduce>(); | |||
| if (remove_axis) { | |||
| reduce = remove_axis->input(0)->owner_opr()->try_cast_final<opr::Reduce>(); | |||
| } | |||
| CHECK_OR_RETURN(reduce); | |||
| bool reduce_sum = reduce->param().mode == opr::Reduce::Param::Mode::SUM; | |||
| CHECK_OR_RETURN(reduce_sum); | |||
| auto input1 = elemwise->input(1)->owner_opr(); | |||
| auto typecvt = input1->try_cast_final<opr::TypeCvt>(); | |||
| CHECK_OR_RETURN(typecvt); | |||
| auto is_typecvt_f32 = typecvt->param().enumv() == DTypeEnum::Float32; | |||
| CHECK_OR_RETURN(is_typecvt_f32); | |||
| auto get_var_shape = | |||
| typecvt->input(0)->owner_opr()->try_cast_final<opr::GetVarShape>(); | |||
| CHECK_OR_RETURN(get_var_shape); | |||
| bool same_parent = | |||
| get_var_shape->input(0)->owner_opr() == reduce->input(0)->owner_opr(); | |||
| CHECK_OR_RETURN(same_parent); | |||
| CHECK_OR_RETURN( | |||
| find_tool.used_count(get_var_shape->input(0)->owner_opr()) == 2); | |||
| bool same_axis = get_var_shape->param().axis == reduce->param().axis; | |||
| CHECK_OR_RETURN(same_axis); | |||
| auto new_reduce_param = reduce->param(); | |||
| new_reduce_param.mode = opr::Reduce::Mode::MEAN; | |||
| auto new_node = | |||
| opr::Reduce::make(rewriter.get_var(reduce->input(0)), new_reduce_param); | |||
| if (remove_axis) { | |||
| new_node = opr::AxisAddRemove::make( | |||
| new_node, remove_axis->param(), remove_axis->config()); | |||
| } | |||
| rewriter.replace_var( | |||
| opr->output(0), new_node.node(), | |||
| mgb_cstr_log("replace reduce_sum+div_axis -> reduce_mean")); | |||
| return true; | |||
| }; | |||
| auto on_opr = [&try_fuse_reduce_mean, &rewriter](OperatorNodeBase* opr) { | |||
| if (!try_fuse_reduce_mean(opr)) { | |||
| rewriter.auto_replace_outputs(opr); | |||
| } | |||
| }; | |||
| opt.graph().iter(on_opr); | |||
| rewriter.apply_inplace(); | |||
| MIDOUT_E | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
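The rewrite relies on the identity mean(x, axis) = sum(x, axis) / x.shape[axis]; that is why the pass only fires when the divisor is a GetVarShape taken on the same input tensor and the same axis as the reduce (the same_parent and same_axis checks above).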
| @@ -722,7 +722,10 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options( | |||
| options.disable_##_option(); \ | |||
| } \ | |||
| } | |||
| cb(fuse_grain, { | |||
| add_pass<FoldingReduceMeanPass>(); | |||
| add_pass<FoldingGlobalPoolingPass>(); | |||
| }); | |||
| cb(fuse_preprocess, { | |||
| add_pass(FuseNCHW4Int8Preprocess::make()); | |||
| add_pass<FuseWarpPerspectiveDimshufflePass>(); | |||
| @@ -0,0 +1,35 @@ | |||
| #pragma once | |||
| #include "../../core/impl/graph/cg_impl.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/utils/hash_ct.h" | |||
| namespace mgb { | |||
| namespace gopt { | |||
| namespace { | |||
| #define CHECK_OR_RETURN(x) \ | |||
| if (!(x)) { \ | |||
| return false; \ | |||
| } | |||
| class FindNext { | |||
| using DepType = cg::OperatorNodeProp::DepType; | |||
| public: | |||
| FindNext(OptState& opt) { | |||
| opt.graph().iter([&](OperatorNodeBase* opr) { | |||
| for (auto&& i : opr->node_prop().dep_map()) { | |||
| m_readers[i.first->owner_opr()].emplace_back(opr, i.second); | |||
| } | |||
| }); | |||
| } | |||
| size_t used_count(OperatorNodeBase* opr) { return m_readers[opr].size(); } | |||
| private: | |||
| ThinHashMap<OperatorNodeBase*, SmallVector<std::pair<OperatorNodeBase*, DepType>>> | |||
| m_readers; | |||
| }; | |||
| } // namespace | |||
| } // namespace gopt | |||
| } // namespace mgb | |||
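A minimal sketch of how the folding passes use this helper (the reduce_opr variable is assumed):

    FindNext find_tool(opt);                  // index the readers of every operator
    // only fold when the intermediate value has a single consumer; otherwise
    // other users of the reduce/axis-modify output would lose their input
    if (find_tool.used_count(reduce_opr) <= 1) {
        // safe to rewrite the pattern into AdaptivePooling / reduce MEAN
    }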
| @@ -4,6 +4,7 @@ | |||
| #include "megbrain/graph/event.h" | |||
| #include "megbrain/opr/basic_arith.h" | |||
| #include "megbrain/opr/blas.h" | |||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||
| #include "megbrain/opr/dnn/batch_norm.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/opr/dnn/local.h" | |||
| @@ -1368,6 +1369,8 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { | |||
| megdnn::param::Convolution::Format::NCHW88; | |||
| megdnn::param::Pooling::Format pooling_format = | |||
| megdnn::param::Pooling::Format::NCHW88; | |||
| megdnn::param::AdaptivePooling::Format adapt_pooling_format = | |||
| megdnn::param::AdaptivePooling::Format::NCHW88; | |||
| megdnn::param::Resize::Format resize_format = megdnn::param::Resize::Format::NCHW88; | |||
| std::string convter_pass_name = "conv_format_nchw88"; | |||
| @@ -1381,6 +1384,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { | |||
| conv_bias_format = megdnn::param::ConvBias::Format::NCHW44; | |||
| conv_format = megdnn::param::Convolution::Format::NCHW44; | |||
| pooling_format = megdnn::param::Pooling::Format::NCHW44; | |||
| adapt_pooling_format = megdnn::param::AdaptivePooling::Format::NCHW44; | |||
| resize_format = megdnn::param::Resize::Format::NCHW44; | |||
| convter_pass_name = "conv_format_nchw44"; | |||
| } | |||
| @@ -1646,6 +1650,33 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { | |||
| return new_opr; | |||
| } | |||
| }; | |||
| auto replace_adapt_pooling_opr = [=](OperatorNodeBase* opr, | |||
| const VarNodeArray& new_inp) { | |||
| mgb_assert(opr->input().size() == new_inp.size()); | |||
| auto& pooling_opr = opr->cast_final_safe<opr::AdaptivePooling>(); | |||
| mgb_throw_if( | |||
| pooling_opr.param().format != | |||
| opr::AdaptivePoolingForward::Param::Format::NCHW, | |||
| MegBrainError, | |||
| "ConvertFormat Pass only support converting NCHW to NCHWxx"); | |||
| VarNode* inp_0 = new_inp[0]; | |||
| VarNode* inp_1 = new_inp[1]; | |||
| //! if input is nchwxx | |||
| if (inp_0->shape().ndim == 5) { | |||
| auto new_param = pooling_opr.param(); | |||
| new_param.format = adapt_pooling_format; | |||
| auto new_pooling_opr = opr::AdaptivePoolingForward::make( | |||
| inp_0, inp_1, new_param, opr->config()); | |||
| mgb_assert( | |||
| new_pooling_opr.shape().ndim == 5, | |||
| "The pooling dst dim is not trans to nchwxx"); | |||
| return new_pooling_opr.node()->owner_opr(); | |||
| } else { | |||
| auto new_opr = | |||
| serialization::copy_opr_shallow(*opr, new_inp, opr->config()); | |||
| return new_opr; | |||
| } | |||
| }; | |||
| auto replace_resize_opr = [=](OperatorNodeBase* opr, const VarNodeArray& new_inp) { | |||
| mgb_assert(opr->input().size() == new_inp.size()); | |||
| @@ -1763,6 +1794,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) { | |||
| replace_func[opr::Convolution::typeinfo()] = replace_conv_opr; | |||
| replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr; | |||
| replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr; | |||
| replace_func[opr::AdaptivePooling::typeinfo()] = replace_adapt_pooling_opr; | |||
| replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr; | |||
| replace_func[opr::Concat::typeinfo()] = replace_multi_inp_opr; | |||
| replace_func[opr::Elemwise::typeinfo()] = replace_multi_inp_opr; | |||
| @@ -327,6 +327,8 @@ struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions { | |||
| ret |= 1u << 4; | |||
| if (fuse_preprocess) | |||
| ret |= 1u << 5; | |||
| if (fuse_grain) | |||
| ret |= 1u << 6; | |||
| return ret; | |||
| } | |||
| @@ -338,6 +340,7 @@ struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions { | |||
| ret.fuse_conv_bias_with_z = buf & 1u << 3; | |||
| ret.weight_preprocess = buf & 1u << 4; | |||
| ret.fuse_preprocess = buf & 1u << 5; | |||
| ret.fuse_grain = buf & 1u << 6; | |||
| ret.layout_transform = (LayoutTransform)(buf >> 32); | |||
| return ret; | |||
| } | |||
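A hedged sketch of the round trip these two hunks implement: fuse_grain now occupies bit 6 of the packed options word, next to fuse_preprocess at bit 5, while the layout transform stays in the upper 32 bits. The enclosing method names serialize()/deserialize() are assumed, since they sit outside the hunk:

    auto opt = gopt::OptimizeForInferenceOptions{};
    opt.fuse_preprocess = true;        // bit 5
    opt.fuse_grain = true;             // bit 6, added by this change
    uint64_t packed = opt.serialize(); // assumed method name
    auto restored = gopt::OptimizeForInferenceOptions::deserialize(packed);
    mgb_assert(restored.fuse_grain && restored.fuse_preprocess);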
| @@ -477,6 +480,25 @@ public: | |||
| void apply(OptState& opt) const override; | |||
| }; | |||
| #endif | |||
| /** | |||
| * \brief older MegBrain graphs expressed reduce_mean as reduce_sum followed by a division; fuse them back into a single reduce_mean for efficiency | |||
| * | |||
| */ | |||
| class FoldingReduceMeanPass final : public Pass { | |||
| public: | |||
| const char* name() const override; | |||
| void apply(OptState& opt) const override; | |||
| }; | |||
| /** | |||
| * \brief fold a reduce over the merged H*W axis into global (adaptive) pooling, so the NCHWxx layout optimization can handle it | |||
| * | |||
| */ | |||
| class FoldingGlobalPoolingPass final : public Pass { | |||
| public: | |||
| const char* name() const override; | |||
| void apply(OptState& opt) const override; | |||
| }; | |||
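These two passes are normally scheduled through the fuse_grain option (see add_passes_for_optimize_options earlier in this diff). A rough sketch of wiring them up directly; the apply_inplace signature and the endpoint_vars variable are assumptions:

    gopt::GraphOptimizer optimizer;
    optimizer.add_pass<gopt::FoldingReduceMeanPass>();     // sum/div -> reduce MEAN first
    optimizer.add_pass<gopt::FoldingGlobalPoolingPass>();  // then reduce -> adaptive pooling
    optimizer.apply_inplace(endpoint_vars);                // endpoint_vars: VarNodeArray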
| /*! | |||
| * \brief padding channel to enable fast int8/int4 support | |||
| @@ -0,0 +1,249 @@ | |||
| #include "megbrain/opr/dnn/local.h" | |||
| #include "megbrain/test/helper.h" | |||
| #include "megbrain/gopt/basic_arith.h" | |||
| #include "megbrain/gopt/gtrans.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/basic_arith_wrapper.h" | |||
| #include "megbrain/opr/blas.h" | |||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||
| #include "megbrain/opr/io.h" | |||
| #include "megbrain/opr/nn_int.h" | |||
| #include "megbrain/opr/tensor_gen.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megbrain/opr/utility.h" | |||
| #include "./helper.h" | |||
| #include "megbrain/comp_node_env.h" | |||
| #include "megdnn/tensor_format.h" | |||
| #include <random> | |||
| #include <vector> | |||
| using namespace mgb; | |||
| namespace { | |||
| //! find the first operator of a specific type; raise an exception if not found | |||
| template <typename T> | |||
| T& find_opr(SymbolVar endpoint) { | |||
| T* found = nullptr; | |||
| auto cb = [&found](cg::OperatorNodeBase* opr) { | |||
| if (!found && opr->same_type<T>()) { | |||
| found = &opr->cast_final_safe<T>(); | |||
| } | |||
| }; | |||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||
| mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str()); | |||
| return *found; | |||
| } | |||
| template <typename T> | |||
| T& find_opr(SymbolVar endpoint, const std::string& node_name) { | |||
| T* found = nullptr; | |||
| auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) { | |||
| if (!found && opr->same_type<T>() && opr->name() == node_name) { | |||
| found = &opr->cast_final_safe<T>(); | |||
| } | |||
| }; | |||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||
| mgb_assert( | |||
| found, "not found opr %s from %s", node_name.c_str(), | |||
| endpoint.node()->name().c_str()); | |||
| return *found; | |||
| } | |||
| template <typename T> | |||
| size_t find_opr_num(SymbolVar endpoint) { | |||
| size_t opr_num = 0; | |||
| auto cb = [&opr_num](cg::OperatorNodeBase* opr) { | |||
| if (opr->same_type<T>()) { | |||
| opr_num++; | |||
| } | |||
| }; | |||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||
| return opr_num; | |||
| } | |||
| } // namespace | |||
| TEST(TestGoptOldModel, FoldingGlobalPooling) { | |||
| HostTensorGenerator<> gen; | |||
| auto cn = CompNode::load("cpu0"); | |||
| auto graph = ComputingGraph::make(); | |||
| graph->options().graph_opt_level = 0; | |||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||
| }; | |||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||
| opr::Convolution::Param param_conv; | |||
| param_conv.stride_h = param_conv.stride_w = 1; | |||
| param_conv.pad_h = param_conv.pad_w = 1; | |||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||
| auto conv1 = | |||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||
| auto hxw = conv_h * conv_w; | |||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||
| opr::Reduce::Param param_reduce; | |||
| param_reduce.axis = 2; | |||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||
| auto true_div = reduce_remove_axis / fp32_hw_count; | |||
| auto y = opr::AxisAddRemove::make( | |||
| true_div, {opr::AxisAddRemove::AxisDesc::make_add(2), | |||
| opr::AxisAddRemove::AxisDesc::make_add(3)}); | |||
| SymbolVar y_opt = y; | |||
| { | |||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||
| options.fuse_grain = true; | |||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||
| } | |||
| ASSERT_EQ( | |||
| opr::AdaptivePooling::Param::Mode::AVERAGE, | |||
| find_opr<opr::AdaptivePooling>(y_opt).param().mode); | |||
| graph->compile({{y_opt, {}}}) | |||
| ->to_json() | |||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingGlobalPooling.json")); | |||
| HostTensorND host_y_opt, host_y; | |||
| auto func = graph->compile( | |||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||
| func->execute(); | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||
| } | |||
| TEST(TestGoptOldModel, FoldingGlobalPooling2) { | |||
| HostTensorGenerator<> gen; | |||
| auto cn = CompNode::load("cpu0"); | |||
| auto graph = ComputingGraph::make(); | |||
| graph->options().graph_opt_level = 0; | |||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||
| }; | |||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||
| opr::Convolution::Param param_conv; | |||
| param_conv.stride_h = param_conv.stride_w = 1; | |||
| param_conv.pad_h = param_conv.pad_w = 1; | |||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||
| auto conv1 = | |||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||
| auto hxw = conv_h * conv_w; | |||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||
| opr::Reduce::Param param_reduce; | |||
| param_reduce.axis = 2; | |||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||
| auto true_div = reduce_remove_axis / fp32_hw_count; | |||
| auto y = opr::Dimshuffle::make(true_div, {0, 1, -1, -1}); | |||
| SymbolVar y_opt = y; | |||
| { | |||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||
| options.fuse_grain = true; | |||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||
| } | |||
| ASSERT_EQ( | |||
| opr::AdaptivePooling::Param::Mode::AVERAGE, | |||
| find_opr<opr::AdaptivePooling>(y_opt).param().mode); | |||
| graph->compile({{y_opt, {}}}) | |||
| ->to_json() | |||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingGlobalPooling2.json")); | |||
| HostTensorND host_y_opt, host_y; | |||
| auto func = graph->compile( | |||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||
| func->execute(); | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||
| } | |||
| TEST(TestGoptOldModel, FoldingReduceMean) { | |||
| HostTensorGenerator<> gen; | |||
| auto cn = CompNode::load("cpu0"); | |||
| auto graph = ComputingGraph::make(); | |||
| graph->options().graph_opt_level = 0; | |||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||
| }; | |||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||
| opr::Convolution::Param param_conv; | |||
| param_conv.stride_h = param_conv.stride_w = 1; | |||
| param_conv.pad_h = param_conv.pad_w = 1; | |||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||
| auto conv1 = | |||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||
| auto hxw = conv_h * conv_w; | |||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||
| opr::Reduce::Param param_reduce; | |||
| param_reduce.axis = 2; | |||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||
| auto y = reduce / hw_count; | |||
| SymbolVar y_opt = y; | |||
| { | |||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||
| options.fuse_grain = true; | |||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||
| } | |||
| ASSERT_EQ( | |||
| opr::Reduce::Param::Mode::MEAN, find_opr<opr::Reduce>(y_opt).param().mode); | |||
| graph->compile({{y_opt, {}}}) | |||
| ->to_json() | |||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingReduceMean.json")); | |||
| HostTensorND host_y_opt, host_y; | |||
| auto func = graph->compile( | |||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||
| func->execute(); | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||
| *host_x = *gen({2, 3, 16, 16}, cn); | |||
| func->execute(); | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||
| } | |||
| @@ -7,6 +7,7 @@ | |||
| #include "megbrain/opr/basic_arith_wrapper.h" | |||
| #include "megbrain/opr/blas.h" | |||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||
| #include "megbrain/opr/dnn/batch_norm.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/opr/dnn/pooling.h" | |||
| @@ -4113,6 +4114,71 @@ TEST(TestGoptInference, ConvertFormatNCHW44Reshape) { | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1); | |||
| } | |||
| TEST(TestGoptInference, ConvertFormatNCHW44GlobalPooling) { | |||
| HostTensorGenerator<> gen; | |||
| auto cn = CompNode::load("cpu0"); | |||
| auto graph = ComputingGraph::make(); | |||
| graph->options().graph_opt_level = 0; | |||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||
| }; | |||
| auto host_x1 = gen({1, 4, 16, 16}, cn); | |||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x1); | |||
| opr::Convolution::Param param_conv; | |||
| param_conv.stride_h = param_conv.stride_w = 1; | |||
| param_conv.pad_h = param_conv.pad_w = 1; | |||
| auto w1 = mkcvar("w1", {8, 4, 3, 3}); | |||
| auto conv1 = | |||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||
| auto hxw = conv_h * conv_w; | |||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||
| opr::Reduce::Param param_reduce; | |||
| param_reduce.axis = 2; | |||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||
| auto reduce_mean = reduce_remove_axis / fp32_hw_count; | |||
| auto global_pool = opr::AxisAddRemove::make( | |||
| reduce_mean, {opr::AxisAddRemove::AxisDesc::make_add(2), | |||
| opr::AxisAddRemove::AxisDesc::make_add(3)}); | |||
| opr::Elemwise::Param elem_param; | |||
| elem_param.mode = opr::Elemwise::Param::Mode::RELU; | |||
| auto y = opr::Elemwise::make({global_pool}, elem_param); | |||
| SymbolVar y_opt; | |||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||
| options.enable_fuse_grain(); | |||
| options.enable_nchw44(); | |||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||
| ASSERT_EQ( | |||
| opr::AdaptivePooling::Param::Format::NCHW44, | |||
| find_opr<opr::AdaptivePooling>(y_opt).param().format); | |||
| graph->compile({{y_opt, {}}}) | |||
| ->to_json() | |||
| ->writeto_fpath(output_file( | |||
| "TestGoptInference.ConvertFormatNCHW44GlobalPooling.json")); | |||
| HostTensorND host_y_opt, host_y; | |||
| auto func = graph->compile( | |||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||
| func->execute(); | |||
| //! may go to winograd on x86-32, so set the error tolerance to 1e-1 | |||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1); | |||
| } | |||
| TEST(TestGoptInference, ConvertFormatNCHW44_DOT) { | |||
| HostTensorGenerator<> gen; | |||
| auto cn = CompNode::load("cpu0"); | |||
| @@ -39,7 +39,7 @@ void AdaptivePoolingForward::outshape_by_symvar_do_get_output_shape( | |||
| cg::copy_tensor_value_to_shape(oshp2d, *shpinfo.shpval_inp_val.at(0)); | |||
| auto src = shpinfo.shape_inp_shp.at(0); | |||
| mgb_assert( | |||
| src.ndim == 4 && (oshp2d.ndim == 2 || oshp2d.ndim == 1), | |||
| (src.ndim == 4 || src.ndim == 5) && (oshp2d.ndim == 2 || oshp2d.ndim == 1), | |||
| "shape mismatch for AdaptivePooling: src=%s, out2d=%s", | |||
| src.to_string().c_str(), oshp2d.to_string().c_str()); | |||
| @@ -57,8 +57,19 @@ void AdaptivePoolingForward::outshape_by_symvar_do_get_output_shape( | |||
| dest.shape[1] = oshp2d.shape[0]; | |||
| dest.shape[2] = (tshp1n) ? oshp2d.shape[0] : oshp2d.shape[1]; | |||
| dest.shape[3] = src.shape[3]; | |||
| } else if ( | |||
| param_format == Param::Format::NCHW44 || | |||
| param_format == Param::Format::NCHW88) { | |||
| dest.ndim = 5; | |||
| dest.shape[0] = src.shape[0]; | |||
| dest.shape[1] = src.shape[1]; | |||
| dest.shape[2] = oshp2d.shape[0]; | |||
| dest.shape[3] = (tshp1n) ? oshp2d.shape[0] : oshp2d.shape[1]; | |||
| dest.shape[4] = src.shape[4]; | |||
| } else { | |||
| mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||
| mgb_throw( | |||
| MegBrainError, "AdaptivePooling not support %d format", | |||
| (int)param_format); | |||
| } | |||
| } | |||
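A quick shape example for the new branch (values assumed): an NCHW44 src of {2, 2, 16, 16, 4}, i.e. N=2 and C=8 packed 4-wide, with a target output of {1, 1} yields dest = {2, 2, 1, 1, 4}; the packed channel axis is carried through unchanged.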
| @@ -48,7 +48,6 @@ void run(Param::Mode mode) { | |||
| Checker::RunOptions opt; | |||
| opt.numdiff_max_err = 1e-2; | |||
| Checker checker{make_graph, fwd}; | |||
| checker.set_input_allow_grad(1, false).set_input_generator(0, gen); | |||
| checker.run({TensorShape{1, 1, 10, 7}, TensorShape{5, 4}}, opt); | |||